From 57c7af92112c4f02a887cbf864c4a1b641c3cc56 Mon Sep 17 00:00:00 2001 From: KassieZ <139741991+KassieZ@users.noreply.github.com> Date: Fri, 3 Jan 2025 22:05:11 +0800 Subject: [PATCH] [update] Update sidebar of Getting Started/ Guide / Clickbench (#1710) ## Versions - [ ] dev - [ ] 3.0 - [ ] 2.1 - [ ] 2.0 ## Languages - [ ] Chinese - [ ] English ## Docs Checklist - [ ] Checked by AI - [ ] Test Cases Built --- docs/data-operate/export/export-manual.md | 2 +- .../export/export-with-mysql-dump.md | 2 +- docs/data-operate/export/outfile.md | 2 +- docs/db-connect/arrow-flight-sql-connect.md | 2 +- docs/db-connect/database-connect.md | 2 +- .../tutorials => }/log-storage-analysis.md | 2 +- docs/table-design/data-model/aggregate.md | 4 +- docs/table-design/data-model/duplicate.md | 4 +- docs/table-design/data-model/overview.md | 6 +- docs/table-design/data-model/unique.md | 4 +- docs/table-design/schema-change.md | 2 +- docs/table-design/tiered-storage/overview.md | 2 +- docusaurus.config.js | 18 +- .../current.json | 6 +- .../current/db-connect/database-connect.md | 2 +- .../gettingStarted/what-is-apache-doris.md | 2 +- .../log-storage-analysis.md | 6 +- .../pipeline-execution-engine.md | 56 +- .../table-design/column-compression.md | 8 +- .../table-design/data-model/aggregate.md | 4 +- .../table-design/tiered-storage/overview.md | 4 +- .../building-lakehouse/doris-hudi.md | 314 ---------- .../building-lakehouse/doris-iceberg.md | 306 --------- .../building-lakehouse/doris-lakesoul.md | 349 ----------- .../building-lakehouse/doris-paimon.md | 269 -------- .../gettingStarted/what-is-apache-doris.md | 2 +- .../unique-update-concurrent-control.md | 6 +- .../building-lakehouse/doris-hudi.md | 314 ---------- .../building-lakehouse/doris-iceberg.md | 473 -------------- .../building-lakehouse/doris-lakesoul.md | 349 ----------- .../building-lakehouse/doris-paimon.md | 269 -------- .../gettingStarted/what-is-apache-doris.md | 2 +- .../log-storage-analysis.md | 6 +- .../version-2.1.json | 6 +- .../data-operate/delete/delete-overview.md | 2 +- .../unique-update-concurrent-control.md | 6 +- .../db-connect/database-connect.md | 2 +- .../gettingStarted/what-is-apache-doris.md | 2 +- ...storage-compute-coupled-deploy-manually.md | 2 +- .../log-storage-analysis.md | 6 +- .../pipeline-execution-engine.md | 56 +- .../table-design/column-compression.md | 8 +- .../table-design/data-model/aggregate.md | 4 +- .../table-design/tiered-storage/overview.md | 4 +- .../version-3.0.json | 6 +- .../data-operate/delete/delete-overview.md | 2 +- .../unique-update-concurrent-control.md | 6 +- .../db-connect/database-connect.md | 2 +- .../tutorials/log-storage-analysis.md | 565 ----------------- .../gettingStarted/what-is-apache-doris.md | 2 +- .../log-storage-analysis.md | 6 +- .../pipeline-execution-engine.md | 56 +- .../table-design/column-compression.md | 8 +- .../table-design/data-model/aggregate.md | 4 +- .../table-design/tiered-storage/overview.md | 4 +- sidebars.json | 38 +- src/pages/index.tsx | 2 +- .../export/export_with_mysql_dump.md | 2 +- .../building-lakehouse/doris-hudi.md | 313 ---------- .../building-lakehouse/doris-iceberg.md | 304 --------- .../building-lakehouse/doris-lakesoul.md | 341 ---------- .../building-lakehouse/doris-paimon.md | 270 -------- .../export/export-with-mysql-dump.md | 2 +- .../building-lakehouse/doris-hudi.md | 313 ---------- .../building-lakehouse/doris-iceberg.md | 470 -------------- .../building-lakehouse/doris-lakesoul.md | 341 ---------- .../building-lakehouse/doris-paimon.md | 270 
-------- .../table-design/data-model/duplicate.md | 4 +- .../version-2.0/table-design/schema-change.md | 2 +- .../data-operate/export/export-manual.md | 2 +- .../export/export-with-mysql-dump.md | 2 +- .../data-operate/export/outfile.md | 2 +- .../db-connect/arrow-flight-sql-connect.md | 2 +- .../db-connect/database-connect.md | 2 +- .../tutorials/log-storage-analysis.md | 591 ------------------ ...storage-compute-coupled-deploy-manually.md | 2 +- .../log-storage-analysis.md | 2 +- .../practical-guide/log-storage-analysis.md | 2 +- .../table-design/data-model/aggregate.md | 4 +- .../table-design/data-model/duplicate.md | 2 +- .../table-design/data-model/overview.md | 6 +- .../table-design/data-model/unique.md | 4 +- .../version-2.1/table-design/schema-change.md | 2 +- .../table-design/tiered-storage/overview.md | 2 +- .../data-operate/export/export-manual.md | 2 +- .../export/export-with-mysql-dump.md | 2 +- .../data-operate/export/outfile.md | 2 +- .../db-connect/arrow-flight-sql-connect.md | 2 +- .../db-connect/database-connect.md | 2 +- .../tutorials/log-storage-analysis.md | 591 ------------------ .../log-storage-analysis.md | 2 +- .../practical-guide/log-storage-analysis.md | 2 +- .../table-design/data-model/aggregate.md | 4 +- .../table-design/data-model/duplicate.md | 4 +- .../table-design/data-model/overview.md | 6 +- .../table-design/data-model/unique.md | 4 +- .../version-3.0/table-design/schema-change.md | 2 +- .../table-design/tiered-storage/overview.md | 2 +- versioned_sidebars/version-1.2-sidebars.json | 20 +- versioned_sidebars/version-2.0-sidebars.json | 20 +- versioned_sidebars/version-2.1-sidebars.json | 58 +- versioned_sidebars/version-3.0-sidebars.json | 34 +- 102 files changed, 269 insertions(+), 7347 deletions(-) rename docs/{gettingStarted/tutorials => }/log-storage-analysis.md (99%) rename i18n/zh-CN/docusaurus-plugin-content-docs/{version-2.1/gettingStarted/tutorials => current}/log-storage-analysis.md (99%) delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md rename i18n/zh-CN/docusaurus-plugin-content-docs/{current/gettingStarted/tutorials => version-2.0}/log-storage-analysis.md (99%) rename i18n/zh-CN/docusaurus-plugin-content-docs/{version-1.2/gettingStarted/tutorials => version-2.1}/log-storage-analysis.md (99%) delete mode 100644 i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md rename i18n/zh-CN/docusaurus-plugin-content-docs/{version-2.0/gettingStarted/tutorials => version-3.0}/log-storage-analysis.md (99%) 
delete mode 100644 versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md delete mode 100644 versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md delete mode 100644 versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md delete mode 100644 versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md delete mode 100644 versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md delete mode 100644 versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md delete mode 100644 versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md delete mode 100644 versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md delete mode 100644 versioned_docs/version-2.1/gettingStarted/tutorials/log-storage-analysis.md rename versioned_docs/{version-1.2/gettingStarted/tutorials => version-2.1}/log-storage-analysis.md (99%) delete mode 100644 versioned_docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md rename versioned_docs/{version-2.0/gettingStarted/tutorials => version-3.0}/log-storage-analysis.md (99%) diff --git a/docs/data-operate/export/export-manual.md b/docs/data-operate/export/export-manual.md index 8bdb4f4183088..c88d5119700d4 100644 --- a/docs/data-operate/export/export-manual.md +++ b/docs/data-operate/export/export-manual.md @@ -1,6 +1,6 @@ --- { - "title": "Export", + "title": "Using EXPORT Command", "language": "en" } --- diff --git a/docs/data-operate/export/export-with-mysql-dump.md b/docs/data-operate/export/export-with-mysql-dump.md index 121ce811a99a8..f1aacd79126da 100644 --- a/docs/data-operate/export/export-with-mysql-dump.md +++ b/docs/data-operate/export/export-with-mysql-dump.md @@ -1,6 +1,6 @@ --- { -"title": "MySQL Dump", +"title": "Using MySQL Dump", "language": "en" } --- diff --git a/docs/data-operate/export/outfile.md b/docs/data-operate/export/outfile.md index d51a0047e5062..1ec2ce0946bec 100644 --- a/docs/data-operate/export/outfile.md +++ b/docs/data-operate/export/outfile.md @@ -1,6 +1,6 @@ --- { - "title": "Select Into Outfile", + "title": "Using SELECT INTO OUTFILE Command", "language": "en" } --- diff --git a/docs/db-connect/arrow-flight-sql-connect.md b/docs/db-connect/arrow-flight-sql-connect.md index adfd0dc540a3e..54382516bb980 100644 --- a/docs/db-connect/arrow-flight-sql-connect.md +++ b/docs/db-connect/arrow-flight-sql-connect.md @@ -1,6 +1,6 @@ --- { - "title": "High-speed data transmission link based on Arrow Flight SQL", + "title": "Connecting by Arrow Flight SQL Protocol", "language": "en" } --- diff --git a/docs/db-connect/database-connect.md b/docs/db-connect/database-connect.md index f09c90eaf9a20..bd01e8a68d7d4 100644 --- a/docs/db-connect/database-connect.md +++ b/docs/db-connect/database-connect.md @@ -1,6 +1,6 @@ --- { - "title": "Connecting to Database", + "title": "Connecting by MySQL Protocol", "language": "en" } --- diff --git a/docs/gettingStarted/tutorials/log-storage-analysis.md b/docs/log-storage-analysis.md similarity index 99% rename from docs/gettingStarted/tutorials/log-storage-analysis.md rename to docs/log-storage-analysis.md index 9d54040e66696..dd20c029d04b5 100644 --- a/docs/gettingStarted/tutorials/log-storage-analysis.md +++ b/docs/log-storage-analysis.md @@ -1,6 +1,6 @@ --- { - "title": "Building log analysis platform", + "title": "Log Storage and Analysis", "language": "en" } --- diff 
--git a/docs/table-design/data-model/aggregate.md b/docs/table-design/data-model/aggregate.md index 987f909395a46..20fe8b8baee68 100644 --- a/docs/table-design/data-model/aggregate.md +++ b/docs/table-design/data-model/aggregate.md @@ -1,7 +1,7 @@ --- { - "title": "聚合模型", - "language": "zh-CN" + "title": "Aggregate Model", + "language": "en" } --- diff --git a/docs/table-design/data-model/duplicate.md b/docs/table-design/data-model/duplicate.md index f7de4e87a66b4..0e79c65c38038 100644 --- a/docs/table-design/data-model/duplicate.md +++ b/docs/table-design/data-model/duplicate.md @@ -1,7 +1,7 @@ --- { - "title": "明细模型", - "language": "zh-CN" + "title": "Detail Model", + "language": "en" } --- diff --git a/docs/table-design/data-model/overview.md b/docs/table-design/data-model/overview.md index e2578e8c6d7d7..d1de425396593 100644 --- a/docs/table-design/data-model/overview.md +++ b/docs/table-design/data-model/overview.md @@ -1,7 +1,7 @@ --- { - "title": "模型概述", - "language": "zh-CN" + "title": "Table Model Overview", + "language": "en" } --- @@ -34,7 +34,7 @@ Doris supports three types of table models: * **Primary Key Model (Unique Key Model)**: Ensures that each row has a unique Key value, and guarantees that there are no duplicate rows for a given Key column. The Doris storage layer retains only the latest written data for each key, making this model suitable for scenarios that involve data updates. -* **Aggregation Model (Aggregate Key Model)**: Allows data to be aggregated based on the Key columns. The Doris storage layer retains aggregated data, reducing storage space and improving query performance. This model is typically used in situations where summary or aggregated information (such as totals or averages) is required. +* **Aggregate Model (Aggregate Key Model)**: Allows data to be aggregated based on the Key columns. The Doris storage layer retains aggregated data, reducing storage space and improving query performance. This model is typically used in situations where summary or aggregated information (such as totals or averages) is required. Once the table is created, the table model attributes are confirmed and cannot be modified. 
It is crucial to choose the appropriate model based on business requirements: diff --git a/docs/table-design/data-model/unique.md b/docs/table-design/data-model/unique.md index 52e8ae1e11536..e80e983875cb3 100644 --- a/docs/table-design/data-model/unique.md +++ b/docs/table-design/data-model/unique.md @@ -1,7 +1,7 @@ --- { - "title": "主键模型", - "language": "zh-CN" + "title": "Primary Key Model", + "language": "en" } --- diff --git a/docs/table-design/schema-change.md b/docs/table-design/schema-change.md index 98a9a6794e32c..7b433a1fb2278 100644 --- a/docs/table-design/schema-change.md +++ b/docs/table-design/schema-change.md @@ -1,6 +1,6 @@ --- { - "title": "Schema Evolution", + "title": "Schema Change", "language": "en" } --- diff --git a/docs/table-design/tiered-storage/overview.md b/docs/table-design/tiered-storage/overview.md index 6a7d3af05a336..f9003e67139d4 100644 --- a/docs/table-design/tiered-storage/overview.md +++ b/docs/table-design/tiered-storage/overview.md @@ -1,6 +1,6 @@ --- { - "title": "Tiered Storage", + "title": "Tiered Storage Overview", "language": "en-US" } --- diff --git a/docusaurus.config.js b/docusaurus.config.js index be58c49432ead..ae20ee8a8991a 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -183,14 +183,14 @@ const config = { priority: 0.5, filename: 'sitemap.xml', createSitemapItems: async (params) => { - const {defaultCreateSitemapItems, ...rest} = params; - const items = await defaultCreateSitemapItems(rest); - for(let item of items){ - if(item.url.includes('docs/1.2')){ - item.priority = 0.2; + const { defaultCreateSitemapItems, ...rest } = params; + const items = await defaultCreateSitemapItems(rest); + for (let item of items) { + if (item.url.includes('docs/1.2')) { + item.priority = 0.2; + } } - } - return items; + return items; }, }, }), @@ -243,7 +243,7 @@ const config = { { position: 'left', label: 'Docs', - to: '/docs/gettingStarted/what-is-new', + to: '/docs/gettingStarted/what-is-apache-doris', target: '_blank', }, { to: '/blog', label: 'Blog', position: 'left' }, @@ -344,7 +344,7 @@ const config = { { position: 'left', label: 'Docs', - to: '/docs/gettingStarted/what-is-new', + to: '/docs/gettingStarted/what-is-apache-doris', target: '_blank', }, { to: '/blog', label: 'Blog', position: 'left' }, diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current.json b/i18n/zh-CN/docusaurus-plugin-content-docs/current.json index ff98a3d4b8fb3..ed2f988370b27 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current.json +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current.json @@ -63,9 +63,9 @@ "message": "手动部署集群", "description": "The label for category Cluster Deployment Manually in sidebar docs" }, - "sidebar.docs.category.Deployment on Cloud": { + "sidebar.docs.category.Deploying on Cloud": { "message": "云上部署集群", - "description": "The label for category Deployment on Cloud in sidebar docs" + "description": "The label for category Deploying on Cloud in sidebar docs" }, "sidebar.docs.category.Database Connection": { "message": "数据库连接", @@ -552,7 +552,7 @@ "description": "The label for category Cross Cluster Replication in sidebar docs" }, "sidebar.docs.category.Tiered Storage": { - "message": "分层存储", + "message": "冷热数据分层", "description": "The label for category Tiered Storage in sidebar docs" }, "sidebar.docs.category.Business Continuity & Data Recovery": { diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/db-connect/database-connect.md 
b/i18n/zh-CN/docusaurus-plugin-content-docs/current/db-connect/database-connect.md index d781e92299414..47a2ed6066717 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/db-connect/database-connect.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/db-connect/database-connect.md @@ -1,6 +1,6 @@ --- { - "title": "数据库连接", + "title": "通过 MySQL 协议连接", "language": "zh-CN" } --- diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/gettingStarted/what-is-apache-doris.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/gettingStarted/what-is-apache-doris.md index 601c64c3697e7..c94107c812349 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/gettingStarted/what-is-apache-doris.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/gettingStarted/what-is-apache-doris.md @@ -139,7 +139,7 @@ Apache Doris 查询引擎是向量化的查询引擎,所有的内存结构能 ![Doris 查询引擎是向量化](/images/getting-started/apache-doris-query-engine-2.png) -Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术, 可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 +Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术,可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 在优化器方面,Apache Doris 使用 CBO 和 RBO 结合的优化策略,RBO 支持常量折叠、子查询改写、谓词下推等,CBO 支持 Join Reorder。目前 CBO 还在持续优化中,主要集中在更加精准的统计信息收集和推导,更加精准的代价模型预估等方面。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/gettingStarted/tutorials/log-storage-analysis.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/log-storage-analysis.md similarity index 99% rename from i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/gettingStarted/tutorials/log-storage-analysis.md rename to i18n/zh-CN/docusaurus-plugin-content-docs/current/log-storage-analysis.md index 9669a0ea06df1..e5c7adcac3165 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/gettingStarted/tutorials/log-storage-analysis.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/log-storage-analysis.md @@ -1,6 +1,6 @@ --- { - "title": "构建日志存储与分析平台", + "title": "日志存储与分析", "language": "zh-CN" } --- @@ -218,13 +218,13 @@ Apache Doris 对 Flexible Schema 的日志数据提供了几个方面的支持 更多关于分区分桶的信息,可参考 [数据划分](../../table-design/data-partitioning/basic-concepts)。 **配置压缩参数** -- 使用 zstd 压缩算法(`"compression" = "zstd"`), 提高数据压缩率。 +- 使用 zstd 压缩算法 (`"compression" = "zstd"`), 提高数据压缩率。 **配置 Compaction 参数** 按照以下说明配置 Compaction 参数: -- 使用 time_series 策略(`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 +- 使用 time_series 策略 (`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 **建立和配置索引参数** diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md index d3f848618fd9a..186acb51820bd 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md @@ -1,6 +1,6 @@ --- { - "title": "并行执行", + "title": "Pipeline 执行引擎", "language": "zh-CN", 
"toc_min_heading_level": 2, "toc_max_heading_level": 4 @@ -28,71 +28,71 @@ under the License. -Doris的并行执行模型是一种Pipeline 执行模型,主要参考了[Hyper](https://db.in.tum.de/~leis/papers/morsels.pdf)论文中Pipeline的实现方式,Pipeline 执行模型能够充分释放多核 CPU 的计算能力,并对 Doris 的查询线程的数目进行限制,解决 Doris 的执行线程膨胀的问题。它的具体设计、实现和效果可以参阅 [DSIP-027](DSIP-027: Support Pipeline Exec Engine - DORIS - Apache Software Foundation) 以及 [DSIP-035](DSIP-035: PipelineX Execution Engine - DORIS - Apache Software Foundation)。 -Doris 3.0 之后,Pipeline 执行模型彻底替换了原有的火山模型,基于Pipeline 执行模型,Doris 实现了 Query、DDL、DML 语句的并行处理。 +Doris 的并行执行模型是一种 Pipeline 执行模型,主要参考了[Hyper](https://db.in.tum.de/~leis/papers/morsels.pdf)论文中 Pipeline 的实现方式,Pipeline 执行模型能够充分释放多核 CPU 的计算能力,并对 Doris 的查询线程的数目进行限制,解决 Doris 的执行线程膨胀的问题。它的具体设计、实现和效果可以参阅 [DSIP-027](DSIP-027: Support Pipeline Exec Engine - DORIS - Apache Software Foundation) 以及 [DSIP-035](DSIP-035: PipelineX Execution Engine - DORIS - Apache Software Foundation)。 +Doris 3.0 之后,Pipeline 执行模型彻底替换了原有的火山模型,基于 Pipeline 执行模型,Doris 实现了 Query、DDL、DML 语句的并行处理。 ## 物理计划 -为了更好的理解Pipeline 执行模型,首先需要介绍一下物理查询计划中两个重要的概念:PlanFragment和PlanNode。我们使用下面这条SQL 作为例子: +为了更好的理解 Pipeline 执行模型,首先需要介绍一下物理查询计划中两个重要的概念:PlanFragment 和 PlanNode。我们使用下面这条 SQL 作为例子: ``` SELECT k1, SUM(v1) FROM A,B WHERE A.k2 = B.k2 GROUP BY k1 ORDER BY SUM(v1); ``` -FE 首先会把它翻译成下面这种逻辑计划,计划中每个节点就是一个PlanNode,每种Node的具体含义,可以参考查看物理计划的介绍。 +FE 首先会把它翻译成下面这种逻辑计划,计划中每个节点就是一个 PlanNode,每种 Node 的具体含义,可以参考查看物理计划的介绍。 ![pip_exec_1](/images/pip_exec_1.png) -由于Doris 是一个MPP的架构,每个查询都会尽可能的让所有的BE 都参与进来并行执行,来降低查询的延时。所以还需要将上述逻辑计划拆分为一个物理计划,拆分物理计划基本上就是在逻辑计划中插入了DataSink和ExchangeNode,通过这两个Node完成了数据在多个BE 之间的Shuffle。拆分完成后,每个PlanFragment 相当于包含了一部分PlanNode,可以作为一个独立的任务发送给BE,每个BE 完成了PlanFragment内包含的PlanNode的计算后,通过DataSink和ExchangeNode 这两个算子把数据shuffle到其他BE上来进行接下来的计算。 +由于 Doris 是一个 MPP 的架构,每个查询都会尽可能的让所有的 BE 都参与进来并行执行,来降低查询的延时。所以还需要将上述逻辑计划拆分为一个物理计划,拆分物理计划基本上就是在逻辑计划中插入了 DataSink 和 ExchangeNode,通过这两个 Node 完成了数据在多个 BE 之间的 Shuffle。拆分完成后,每个 PlanFragment 相当于包含了一部分 PlanNode,可以作为一个独立的任务发送给 BE,每个 BE 完成了 PlanFragment 内包含的 PlanNode 的计算后,通过 DataSink 和 ExchangeNode 这两个算子把数据 shuffle 到其他 BE 上来进行接下来的计算。 ![pip_exec_2](/images/pip_exec_2.png) -所以Doris的规划分为3层: -PLAN:执行计划,一个SQL会被执行规划器翻译成一个执行计划,之后执行计划会提供给执行引擎执行。 +所以 Doris 的规划分为 3 层: +PLAN:执行计划,一个 SQL 会被执行规划器翻译成一个执行计划,之后执行计划会提供给执行引擎执行。 -FRAGMENT:由于DORIS是一个分布式执行引擎。一个完整的执行计划会被切分为多个单机的执行片段。一个FRAGMENT表是一个完整的单机执行片段。多个FRAGMENT组合在一起,构成一个完整的PLAN。 +FRAGMENT:由于 DORIS 是一个分布式执行引擎。一个完整的执行计划会被切分为多个单机的执行片段。一个 FRAGMENT 表是一个完整的单机执行片段。多个 FRAGMENT 组合在一起,构成一个完整的 PLAN。 -PLAN NODE:算子,是执行计划的最小单位。一个FRAGMENT由多个算子构成。每一个算子负责一个实际的执行逻辑,比如聚合,连接等 +PLAN NODE:算子,是执行计划的最小单位。一个 FRAGMENT 由多个算子构成。每一个算子负责一个实际的执行逻辑,比如聚合,连接等 ## Pipeline 执行 -PlanFragment 是FE 发往BE 执行任务的最小单位。BE可能会收到同一个Query的多个不同的PlanFragment,每个PlanFragment都会被单独的处理。在收到PlanFragment 之后,BE会把PlanFragment 拆分为多个Pipeline,进而启动多个PipelineTask 来实现并行执行,提升查询效率。 +PlanFragment 是 FE 发往 BE 执行任务的最小单位。BE 可能会收到同一个 Query 的多个不同的 PlanFragment,每个 PlanFragment 都会被单独的处理。在收到 PlanFragment 之后,BE 会把 PlanFragment 拆分为多个 Pipeline,进而启动多个 PipelineTask 来实现并行执行,提升查询效率。 ![pip_exec_3](/images/pip_exec_3.png) ### Pipeline -一个Pipeline 有一个SourceOperator 和 一个SinkOperator 以及中间的多个其他Operator组成。SourceOperator 代表从外部读取数据,可以是一个表(OlapTable),也可以是一个Buffer(Exchange)。SinkOperator 表示数据的输出,输出可以是通过网络shuffle到别的节点,比如DataStreamSinkOperator,也可以是输出到HashTable,比如Agg算子,JoinBuildHashTable等。 +一个 Pipeline 有一个 SourceOperator 和 一个 SinkOperator 以及中间的多个其他 Operator 组成。SourceOperator 代表从外部读取数据,可以是一个表(OlapTable),也可以是一个 Buffer(Exchange)。SinkOperator 表示数据的输出,输出可以是通过网络 shuffle 到别的节点,比如 DataStreamSinkOperator,也可以是输出到 HashTable,比如 Agg 
算子,JoinBuildHashTable 等。 ![pip_exec_4](/images/pip_exec_4.png) -多个Pipeline 之间实际是有依赖关系的,以JoinNode为例,他实际被拆分到了2个Pipeline 里。其中Pipeline-0是读取Exchange的数据,来构建HashTable;Pipeline-1 是从表里读取数据,来进行Probe。这2个Pipeline 之间是有关联关系的,只有Pipeline-0运行完毕之后才能执行Pipeline-1。这两者之间的依赖关系,称为Dependency。当Pipeline-0 运行完毕后,会调用Dependency的set_ready 方法通知Pipeline-1 可执行。 +多个 Pipeline 之间实际是有依赖关系的,以 JoinNode 为例,他实际被拆分到了 2 个 Pipeline 里。其中 Pipeline-0 是读取 Exchange 的数据,来构建 HashTable;Pipeline-1 是从表里读取数据,来进行 Probe。这 2 个 Pipeline 之间是有关联关系的,只有 Pipeline-0 运行完毕之后才能执行 Pipeline-1。这两者之间的依赖关系,称为 Dependency。当 Pipeline-0 运行完毕后,会调用 Dependency 的 set_ready 方法通知 Pipeline-1 可执行。 ### PipelineTask -Pipeline 实际还是一个逻辑概念,他并不是一个可执行的实体。在有了Pipeline之后,需要进一步的把Pipeline 实例化为多个PipelineTask。将需要读取的数据分配给不同的PipelineTask 最终实现并行处理。同一个Pipeline的多个PipelineTask 之间的Operator 完全相同,他们的区别在于Operator的状态不一样,比如读取的数据不一样,构建出的HashTable 不一样,这些不一样的状态,我们称之为LocalState。 -每个PipelineTask 最终都会被提交到一个线程池中作为独立的任务执行。在Dependency 这种触发机制下,可以更好的利用多核CPU,实现充分的并行。 +Pipeline 实际还是一个逻辑概念,他并不是一个可执行的实体。在有了 Pipeline 之后,需要进一步的把 Pipeline 实例化为多个 PipelineTask。将需要读取的数据分配给不同的 PipelineTask 最终实现并行处理。同一个 Pipeline 的多个 PipelineTask 之间的 Operator 完全相同,他们的区别在于 Operator 的状态不一样,比如读取的数据不一样,构建出的 HashTable 不一样,这些不一样的状态,我们称之为 LocalState。 +每个 PipelineTask 最终都会被提交到一个线程池中作为独立的任务执行。在 Dependency 这种触发机制下,可以更好的利用多核 CPU,实现充分的并行。 ### Operator -在大多数时候,Pipeline 中的每个Operator 都对应了一个PlanNode,但是有一些特殊的算子除外: -- JoinNode,被拆分为JoinBuildOperator和JoinProbeOperator -- AggNode 被拆分为AggSinkOperator和AggSourceOperator -- SortNode 被拆分为SortSinkOperator 和 SortSourceOperator -基本原理是,对于一些breaking 算子(需要把所有的数据都收集齐之后才能运算的算子),把灌入数据的部分拆分为Sink,然后把从这个算子里获取数据的部分称为Source。 +在大多数时候,Pipeline 中的每个 Operator 都对应了一个 PlanNode,但是有一些特殊的算子除外: +- JoinNode,被拆分为 JoinBuildOperator 和 JoinProbeOperator +- AggNode 被拆分为 AggSinkOperator 和 AggSourceOperator +- SortNode 被拆分为 SortSinkOperator 和 SortSourceOperator +基本原理是,对于一些 breaking 算子(需要把所有的数据都收集齐之后才能运算的算子),把灌入数据的部分拆分为 Sink,然后把从这个算子里获取数据的部分称为 Source。 ## Scan 并行化 -扫描数据是一个非常重的IO 操作,它需要从本地磁盘读取大量的数据(如果是数据湖的场景,就需要从HDFS或者S3中读取,延时更长),需要比较多的时间。所以我们在ScanOperator 中引入了并行扫描的技术,ScanOperator会动态的生成多个Scanner,每个Scanner 扫描100w-200w 行左右的数据,每个Scanner 在做数据扫描时,完成相应的数据解压、过滤等计算任务,然后把数据发送给一个DataQueue,供ScanOperator 读取。 +扫描数据是一个非常重的 IO 操作,它需要从本地磁盘读取大量的数据(如果是数据湖的场景,就需要从 HDFS 或者 S3 中读取,延时更长),需要比较多的时间。所以我们在 ScanOperator 中引入了并行扫描的技术,ScanOperator 会动态的生成多个 Scanner,每个 Scanner 扫描 100w-200w 行左右的数据,每个 Scanner 在做数据扫描时,完成相应的数据解压、过滤等计算任务,然后把数据发送给一个 DataQueue,供 ScanOperator 读取。 ![pip_exec_5](/images/pip_exec_5.png) -通过并行扫描的技术可以有效的避免由于分桶不合理或者数据倾斜导致某些ScanOperator 执行时间特别久,把整个查询的延时都拖慢的问题。 +通过并行扫描的技术可以有效的避免由于分桶不合理或者数据倾斜导致某些 ScanOperator 执行时间特别久,把整个查询的延时都拖慢的问题。 ## Local Shuffle -在Pipeline执行模型中,Local Exchange作为一个Pipeline Breaker出现,是在本地将数据重新分发至各个执行任务的技术。它把上游Pipeline输出的全部数据以某种方式(HASH / Round Robin)均匀分发到下游Pipeline的全部Task中。解决执行过程中的数据倾斜的问题,使执行模型不再受数据存储以及plan的限制。接下来我们举例来说明Local Exchange的工作逻辑。 -我们用上述例子中的Pipeline-1为例子进一步阐述Local Exchange如何可以避免数据倾斜。 +在 Pipeline 执行模型中,Local Exchange 作为一个 Pipeline Breaker 出现,是在本地将数据重新分发至各个执行任务的技术。它把上游 Pipeline 输出的全部数据以某种方式(HASH / Round Robin)均匀分发到下游 Pipeline 的全部 Task 中。解决执行过程中的数据倾斜的问题,使执行模型不再受数据存储以及 plan 的限制。接下来我们举例来说明 Local Exchange 的工作逻辑。 +我们用上述例子中的 Pipeline-1 为例子进一步阐述 Local Exchange 如何可以避免数据倾斜。 ![pip_exec_6](/images/pip_exec_6.png) -如上图所示,首先,通过在Pipeline 1中插入Local Exchange,我们把Pipeline 1进一步拆分成Pipeline 1-0和Pipeline 1-1。 -此时,我们不妨假设当前并发等于3(每个Pipeline有3个task),每个task读取存储层的一个bucket,而3个bucket中数据行数分别是1,1,7。则插入Local Exchange前后的执行变化如下: +如上图所示,首先,通过在 Pipeline 1 中插入 Local Exchange,我们把 Pipeline 1 进一步拆分成 Pipeline 1-0 和 Pipeline 1-1。 +此时,我们不妨假设当前并发等于 3(每个 Pipeline 有 3 个 task),每个 task 
读取存储层的一个 bucket,而 3 个 bucket 中数据行数分别是 1,1,7。则插入 Local Exchange 前后的执行变化如下: ![pip_exec_7](/images/pip_exec_7.png) -从图右可以看出,HashJoin和Agg算子需要处理的数据量从(1,1,7)变成了(3,3,3)从而避免了数据倾斜。 -在Doris中,Local Exchange根据一系列规则来决定是否被规划,例如当查询耗时比较大的Join、聚合、窗口函数等算子需要被执行时,我们就需要使用Local Exchange来尽可能避免数据倾斜。 \ No newline at end of file +从图右可以看出,HashJoin 和 Agg 算子需要处理的数据量从 (1,1,7) 变成了 (3,3,3) 从而避免了数据倾斜。 +在 Doris 中,Local Exchange 根据一系列规则来决定是否被规划,例如当查询耗时比较大的 Join、聚合、窗口函数等算子需要被执行时,我们就需要使用 Local Exchange 来尽可能避免数据倾斜。 \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/column-compression.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/column-compression.md index a6a4194b85a4d..7e56a5b6ac443 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/column-compression.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/column-compression.md @@ -1,6 +1,6 @@ --- { - "title": "按列压缩", + "title": "数据压缩", "language": "zh_CN" } --- @@ -44,10 +44,10 @@ Doris 支持多种压缩算法,每种算法在压缩率和解压速度之间 |-------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------| | **无压缩** | - 数据不进行压缩。 | 适用于不需要压缩的场景,例如数据已经被压缩或者存储空间不是问题的情况。 | | **LZ4** | - 压缩和解压速度非常快。
- 压缩比适中。 | 适用于对解压速度要求高的场景,如实时查询或高并发负载。 |
-| **LZ4F (LZ4框架)** | - LZ4的扩展版本,支持更灵活的压缩配置。 - 速度快,压缩比适中。 | 适用于需要快速压缩并对配置有细粒度控制的场景。 |
-| **LZ4HC (LZ4高压缩)** | - 相比LZ4有更高的压缩比,但压缩速度较慢。 - 解压速度与LZ4相当。 | 适用于需要更高压缩比的场景,同时仍然关注解压速度。 |
+| **LZ4F (LZ4 框架)** | - LZ4 的扩展版本,支持更灵活的压缩配置。 - 速度快,压缩比适中。 | 适用于需要快速压缩并对配置有细粒度控制的场景。 |
+| **LZ4HC (LZ4 高压缩)** | - 相比 LZ4 有更高的压缩比,但压缩速度较慢。 - 解压速度与 LZ4 相当。 | 适用于需要更高压缩比的场景,同时仍然关注解压速度。 |
| **ZSTD (Zstandard)** | - 高压缩比,支持灵活的压缩级别调整。 - 即使在高压缩比下,解压速度仍然很快。 | 适用于对存储效率要求较高且需要平衡查询性能的场景。 |
-| **Snappy** | - 设计重点是快速解压。 - 压缩比适中。 | 适用于对解压速度要求高且对CPU消耗低的场景。 |
+| **Snappy** | - 设计重点是快速解压。 - 压缩比适中。 | 适用于对解压速度要求高且对 CPU 消耗低的场景。 |
| **Zlib** | - 提供良好的压缩比与速度平衡。
- 与其他算法相比,压缩和解压速度较慢,但压缩比更高。 | 适用于对存储效率要求较高且对解压速度不敏感的场景,如归档和冷数据存储。 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/data-model/aggregate.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/data-model/aggregate.md index 8ea30edc81f66..79036e7c5f8f5 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/data-model/aggregate.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/data-model/aggregate.md @@ -88,7 +88,7 @@ DISTRIBUTED BY HASH(user_id) BUCKETS 10; * BITMAP_UNION:BIMTAP 类型的列的聚合方式,进行位图的并集聚合。 -:::info 提示: +:::info 提示: 如果以上的聚合方式无法满足业务需求,可以选择使用 agg_state 类型。 ::: @@ -129,7 +129,7 @@ SELECT * FROM example_tbl_agg; ## AGG_STATE -::: info 提示: +:::info 提示: AGG_STATE 是实验特性,建议在开发与测试环境中使用。 ::: diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/tiered-storage/overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/tiered-storage/overview.md index a0df890036e5a..2df6839366077 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/tiered-storage/overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/table-design/tiered-storage/overview.md @@ -1,6 +1,6 @@ --- { - "title": "分层存储", + "title": "冷热数据分层概述", "language": "zh-CN" } --- @@ -30,6 +30,6 @@ under the License. |--------------------|------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------| | **存算分离** | 用户具备部署存算分离的条件 | - 数据以单副本完全存储在对象存储中
- 通过本地缓存加速热数据访问 - 存储与计算资源独立扩展,显著降低存储成本 |
| **本地分层** | 存算一体模式下,用户希望进一步优化本地存储资源 | - 支持将冷数据从 SSD 冷却到 HDD - 充分利用本地存储层级特性,节省高性能存储成本 |
-| **远程分层** | 存算一体模式下,使用廉价的对象存储或者 HDFS 进一步降低成本 | - 冷数据以单副本形式保存到对象存储或者 HDFS中 - 热数据继续使用本地存储 - 不能对一个表和本地分层混合使用 |
+| **远程分层** | 存算一体模式下,使用廉价的对象存储或者 HDFS 进一步降低成本 | - 冷数据以单副本形式保存到对象存储或者 HDFS 中 - 热数据继续使用本地存储
- 不能对一个表和本地分层混合使用 | 通过上述模式,Doris 能够灵活适配用户的部署条件,实现查询效率与存储成本的平衡。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md deleted file mode 100644 index 19afdf3598fa4..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md +++ /dev/null @@ -1,314 +0,0 @@ ---- -{ - "title": "使用 Doris 和 Hudi", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 Hudi 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-hudi.png) - -## Apache Doris & Hudi - -[Apache Hudi](https://hudi.apache.org/) 是目前最主流的开放数据湖格式之一,也是事务性的数据湖管理平台,支持包括 Apache Doris 在内的多种主流查询引擎。 - -Apache Doris 同样对 Apache Hudi 数据表的读取能力进行了增强: - -- 支持 Copy on Write Table:Snapshot Query -- 支持 Merge on Read Table:Snapshot Queries, Read Optimized Queries -- 支持 Time Travel -- 支持 Incremental Read - -凭借 Apache Doris 的高性能查询执行以及 Apache Hudi 的实时数据管理能力,可以实现高效、灵活、低成本的数据查询和分析,同时也提供了强大的数据回溯、审计和增量处理功能,当前基于 Apache Doris 和 Apache Hudi 的组合已经在多个社区用户的真实业务场景中得到验证和推广: - -- 实时数据分析与处理:比如金融行业交易分析、广告行业实时点击流分析、电商行业用户行为分析等常见场景下,都要求实时的数据更新及查询分析。Hudi 能够实现对数据的实时更新和管理,并保证数据的一致性和可靠性,Doris 则能够实时高效处理大规模数据查询请求,二者结合能够充分满足实时数据分析与处理的需求。 -- 数据回溯与审计:对于金融、医疗等对数据安全和准确性要求极高的行业来说,数据回溯和审计是非常重要的功能。Hudi 提供了时间旅行(Time Travel)功能,允许用户查看历史数据状态,结合 Apache Doris 高效查询能力,可快速查找分析任何时间点的数据,实现精确的回溯和审计。 -- 增量数据读取与分析:在进行大数据分析时往往面临着数据规模庞大、更新频繁的问题,Hudi 支持增量数据读取,这使得用户可以只需处理变化的数据,不必进行全量数据更新;同时 Apache Doris 的 Incremental Read 功能也可使这一过程更加高效,显著提升了数据处理和分析的效率。 -- 跨数据源联邦查询:许多企业数据来源复杂,数据可能存储在不同的数据库中。Doris 的 Multi-Catalog 功能支持多种数据源的自动映射与同步,支持跨数据源的联邦查询。这对于需要从多个数据源中获取和整合数据进行分析的企业来说,极大地缩短了数据流转路径,提升了工作效率。 - -本文将在 Docker 环境下,为读者介绍如何快速搭建 Apache Doris + Apache Hudi 的测试及演示环境,并对各功能操作进行演示,帮助读者快速入门。 - -关于更多说明,请参阅 [Hudi Catalog](../../../lakehouse/datalake-analytics/hudi) - -## 使用指南 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/hudi](https://github.com/apache/doris/tree/master/samples/datalake/hudi) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 2.1.4,可修改 | -| Apache Hudi | 0.14| -| Apache Spark | 3.4.2| -| Apache Hive | 2.1.3| -| MinIO | 2022-05-26T05-48-41Z| - - -### 02 环境部署 - -1. 创建 Docker 网络 - - `sudo docker network create -d bridge hudi-net` - -2. 启动所有组件 - - `sudo ./start-hudi-compose.sh` - - > 注:启动前,可将 `start-hudi-compose.sh` 中的 `DORIS_PACKAGE` 和 `DORIS_DOWNLOAD_URL` 修改成需要的 Doris 版本。建议使用 2.1.4 或更高版本。 - -3. 
启动后,可以使用如下脚本,登陆 Spark 命令行或 Doris 命令行: - - ```sql - -- Doris - sudo ./login-spark.sh - - -- Spark - sudo ./login-doris.sh - ``` - -### 03 数据准备 - -接下来先通过 Spark 生成 Hudi 的数据。如下方代码所示,集群中已经包含一张名为 `customer` 的 Hive 表,可以通过这张 Hive 表,创建一个 Hudi 表: - -```sql --- ./login-spark.sh -spark-sql> use default; - --- create a COW table -spark-sql> CREATE TABLE customer_cow -USING hudi -TBLPROPERTIES ( - type = 'cow', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; - --- create a MOR table -spark-sql> CREATE TABLE customer_mor -USING hudi -TBLPROPERTIES ( - type = 'mor', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; -``` - -### 04 数据查询 - -如下所示,Doris 集群中已经创建了名为 `hudi` 的 Catalog(可通过 `SHOW CATALOGS` 查看)。以下为该 Catalog 的创建语句: - -```sql --- 已经创建,无需再次执行 -CREATE CATALOG `hudi` PROPERTIES ( - "type"="hms", - 'hive.metastore.uris' = 'thrift://hive-metastore:9083', - "s3.access_key" = "minio", - "s3.secret_key" = "minio123", - "s3.endpoint" = "http://minio:9000", - "s3.region" = "us-east-1", - "use_path_style" = "true" -); -``` - -1. 手动刷新该 Catalog,对创建的 Hudi 表进行同步: - - ```sql - -- ./login-doris.sh - doris> REFRESH CATALOG hudi; - ``` - -2. 使用 Spark 操作 Hudi 中的数据,都可以在 Doris 中实时可见,不需要再次刷新 Catalog。我们通过 Spark 分别给 COW 和 MOR 表插入一行数据: - - ```sql - spark-sql> insert into customer_cow values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 25); - spark-sql> insert into customer_mor values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 25); - ``` - -3. 通过 Doris 可以直接查询到最新插入的数据: - - ```sql - doris> use hudi.default; - doris> select * from customer_cow where c_custkey = 100; - doris> select * from customer_mor where c_custkey = 100; - ``` - -4. 再通过 Spark 插入 c_custkey=32 已经存在的数据,即覆盖已有数据: - - ```sql - spark-sql> insert into customer_cow values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - spark-sql> insert into customer_mor values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - ``` - -5. 通过 Doris 可以查询更新后的数据: - - ```sql - doris> select * from customer_cow where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. 
final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - doris> select * from customer_mor where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - ``` - -### 05 Incremental Read - -Incremental Read 是 Hudi 提供的功能特性之一,通过 Incremental Read,用户可以获取指定时间范围的增量数据,从而实现对数据的增量处理。对此,Doris 可对插入 `c_custkey=100` 后的变更数据进行查询。如下所示,我们插入了一条 `c_custkey=32` 的数据: - -```sql -doris> select * from customer_cow@incr('beginTime'='20240603015018572'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_cow', 'latest_state', '20240603015018572'); - -doris> select * from customer_mor@incr('beginTime'='20240603015058442'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. 
final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_mor', 'latest_state', '20240603015058442'); -``` - -### 06 TimeTravel - -Doris 支持查询指定快照版本的 Hudi 数据,从而实现对数据的 Time Travel 功能。首先,可以通过 Spark 查询两张 Hudi 表的提交历史: - -```sql -spark-sql> call show_commits(table => 'customer_cow', limit => 10); -20240603033556094 20240603033558249 commit 448833 0 1 1 183 0 0 -20240603015444737 20240603015446588 commit 450238 0 1 1 202 1 0 -20240603015018572 20240603015020503 commit 436692 1 0 1 1 0 0 -20240603013858098 20240603013907467 commit 44902033 100 0 25 18751 0 0 - -spark-sql> call show_commits(table => 'customer_mor', limit => 10); -20240603033745977 20240603033748021 deltacommit 1240 0 1 1 0 0 0 -20240603015451860 20240603015453539 deltacommit 1434 0 1 1 1 1 0 -20240603015058442 20240603015100120 deltacommit 436691 1 0 1 1 0 0 -20240603013918515 20240603013922961 deltacommit 44904040 100 0 25 18751 0 0 -``` - -接着,可通过 Doris 执行 `c_custkey=32` ,查询数据插入之前的数据快照。如下可看到 `c_custkey=32` 的数据还未更新: - -> 注:Time Travel 语法暂时不支持新优化器,需要先执行 set enable_nereids_planner=false;关闭新优化器,该问题将会在后续版本中修复。 - -```sql -doris> select * from customer_cow for time as of '20240603015018572' where c_custkey = 32 or c_custkey = 100; -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | 15 | -| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 25 | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ --- compare with spark-sql -spark-sql> select * from customer_mor timestamp as of '20240603015018572' where c_custkey = 32 or c_custkey = 100; - -doris> select * from customer_mor for time as of '20240603015058442' where c_custkey = 32 or c_custkey = 100; -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 25 | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. 
final, furious requests across the e | 15 | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -spark-sql> select * from customer_mor timestamp as of '20240603015058442' where c_custkey = 32 or c_custkey = 100; -``` - -## 查询优化 - -Apache Hudi 中的数据大致可以分为两类 —— 基线数据和增量数据。基线数据通常是已经经过合并的 Parquet 文件,而增量数据是指由 INSERT、UPDATE 或 DELETE 产生的数据增量。基线数据可以直接读取,增量数据需要通过 Merge on Read 的方式进行读取。 - -对于 Hudi COW 表的查询或者 MOR 表的 Read Optimized 查询而言,其数据都属于基线数据,可直接通过 Doris 原生的 Parquet Reader 读取数据文件,且可获得极速的查询响应。而对于增量数据,Doris 需要通过 JNI 调用 Hudi 的 Java SDK 进行访问。为了达到最优的查询性能,Apache Doris 在查询时,会将一个查询中的数据分为基线和增量数据两部分,并分别使用上述方式进行读取。 - -为验证该优化思路,我们通过 EXPLAIN 语句来查看一个下方示例的查询中,分别有多少基线数据和增量数据。对于 COW 表来说,所有 101 个数据分片均为是基线数据(`hudiNativeReadSplits=101/101`),因此 COW 表全部可直接通过 Doris Parquet Reader 进行读取,因此可获得最佳的查询性能。对于 ROW 表,大部分数据分片是基线数据(`hudiNativeReadSplits=100/101`),一个分片数为增量数据,基本也能够获得较好的查询性能。 - -```sql --- COW table is read natively -doris> explain select * from customer_cow where c_custkey = 32; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_cow | -| predicates: (c_custkey[#5] = 32) | -| inputSplitNum=101, totalFileSize=45338886, scanRanges=101 | -| partition=26/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=101/101 | - --- MOR table: because only the base file contains `c_custkey = 32` that is updated, 100 splits are read natively, while the split with log file is read by JNI. -doris> explain select * from customer_mor where c_custkey = 32; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 32) | -| inputSplitNum=101, totalFileSize=45340731, scanRanges=101 | -| partition=26/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=100/101 | -``` - -可以通过 Spark 进行一些删除操作,进一步观察 Hudi 基线数据和增量数据的变化: - -```sql --- Use delete statement to see more differences -spark-sql> delete from customer_cow where c_custkey = 64; -doris> explain select * from customer_cow where c_custkey = 64; - -spark-sql> delete from customer_mor where c_custkey = 64; -doris> explain select * from customer_mor where c_custkey = 64; -``` - -此外,还可以通过分区条件进行分区裁剪,从而进一步减少数据量,以提升查询速度。如下示例中,通过分区条件 `c_nationkey=15` 进行分区裁减,使得查询请求只需要访问一个分区(`partition=1/26`)的数据即可。 - -```sql --- customer_xxx is partitioned by c_nationkey, we can use the partition column to prune data -doris> explain select * from customer_mor where c_custkey = 64 and c_nationkey = 15; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 64), (c_nationkey[#12] = 15) | -| inputSplitNum=4, totalFileSize=1798186, scanRanges=4 | -| partition=1/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=3/4 | -``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md deleted file mode 100644 index 3cc43ab17e47e..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md +++ /dev/null @@ -1,306 +0,0 @@ ---- -{ - "title": "使用 Doris 和 Iceberg", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 
Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 Iceberg 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-iceberg.png) - -## Apache Doris & Iceberg - -Apache Iceberg 是一种开源、高性能、高可靠的数据湖表格式,可实现超大规模数据的分析与管理。它支持 Apache Doris 在内的多种主流查询引擎,兼容 HDFS 以及各种对象云存储,具备 ACID、Schema 演进、高级过滤、隐藏分区和分区布局演进等特性,可确保高性能查询以及数据的可靠性及一致性,其时间旅行和版本回滚功能也为数据管理带来较高的灵活性。 - -Apache Doris 对 Iceberg 多项核心特性提供了原生支持: - -- 支持 Hive Metastore、Hadoop、REST、Glue、Google Dataproc Metastore、DLF 等多种 Iceberg Catalog 类型。 -- 原生支持 Iceberg V1/V2 表格式,以及 Position Delete、Equality Delete 文件的读取。 -- 支持通过表函数查询 Iceberg 表快照历史。 -- 支持时间旅行(Time Travel)功能。 -- 原生支持 Iceberg 表引擎。可以通过 Apache Doris 直接创建、管理以及将数据写入到 Iceberg 表。支持完善的分区 Transform 函数,从而提供隐藏分区和分区布局演进等能力。 - -用户可以基于 Apache Doris + Apache Iceberg 快速构建高效的湖仓一体解决方案,以灵活应对实时数据分析与处理的各种需求: - -- 通过 Doris 高性能查询引擎对 Iceberg 表数据和其他数据源进行关联数据分析,构建**统一的联邦数据分析平台**。 -- 通过 Doris 直接管理和构建 Iceberg 表,在 Doris 中完成对数据的清洗、加工并写入到 Iceberg 表,构建**统一的湖仓数据处理平台**。 -- 通过 Iceberg 表引擎,将 Doris 数据共享给其他上下游系统做进一步处理,构建**统一的开放数据存储平台**。 - -未来,Apache Iceberg 将作为 Apache Doris 的原生表引擎之一,提供更加完善的湖格式数据的分析、管理功能。Apache Doris 也将逐步支持包括 Update/Delete/Merge、写回时排序、增量数据读取、元数据管理等 Apache Iceberg 更多高级特性,共同构建统一、高性能、实时的湖仓平台。 - -关于更多说明,请参阅 [Iceberg Catalog](../../../lakehouse/datalake-analytics/iceberg.md) - -## 使用指南 - -本文档主要讲解如何在 Docker 环境下快速搭建 Apache Doris + Apache Iceberg 测试 & 演示环境,并展示各功能的使用操作。 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 2.1.5,可修改 | -| Apache Iceberg | 1.4.3| -| MinIO | RELEASE.2024-04-29T09-56-05Z| - -### 02 环境部署 - -1. 启动所有组件 - - `bash ./start_all.sh` - -2. 
启动后,可以使用如下脚本,登陆 Doris 命令行: - - ```sql - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 创建 Iceberg 表 - -首先登陆 Doris 命令行后,Doris 集群中已经创建了名为 Iceberg 的 Catalog(可通过 `SHOW CATALOGS`/`SHOW CREATE CATALOG iceberg` 查看)。以下为该 Catalog 的创建语句: - -```sql --- 已创建,无需执行 -CREATE CATALOG `iceberg` PROPERTIES ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "warehouse" = "s3://warehouse/", - "uri" = "http://rest:8181", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "http://minio:9000" -); -``` - -在 Iceberg Catalog 创建数据库和 Iceberg 表: - -```sql -mysql> SWITCH iceberg; -Query OK, 0 rows affected (0.00 sec) - -mysql> CREATE DATABASE nyc; -Query OK, 0 rows affected (0.12 sec) - -mysql> CREATE TABLE iceberg.nyc.taxis - ( - vendor_id BIGINT, - trip_id BIGINT, - trip_distance FLOAT, - fare_amount DOUBLE, - store_and_fwd_flag STRING, - ts DATETIME - ) - PARTITION BY LIST (vendor_id, DAY(ts)) () - PROPERTIES ( - "compression-codec" = "zstd", - "write-format" = "parquet" - ); -Query OK, 0 rows affected (0.15 sec) -``` - -### 04 数据写入 - -向 Iceberg 表中插入数据: - -```sql -mysql> INSERT INTO iceberg.nyc.taxis - VALUES - (1, 1000371, 1.8, 15.32, 'N', '2024-01-01 9:15:23'), - (2, 1000372, 2.5, 22.15, 'N', '2024-01-02 12:10:11'), - (2, 1000373, 0.9, 9.01, 'N', '2024-01-01 3:25:15'), - (1, 1000374, 8.4, 42.13, 'Y', '2024-01-03 7:12:33'); -Query OK, 4 rows affected (1.61 sec) -{'status':'COMMITTED', 'txnId':'10085'} -``` - -通过 `CREATE TABLE AS SELECT` 来创建一张 Iceberg 表: - -``` -mysql> CREATE TABLE iceberg.nyc.taxis2 AS SELECT * FROM iceberg.nyc.taxis; -Query OK, 6 rows affected (0.25 sec) -{'status':'COMMITTED', 'txnId':'10088'} -``` - -### 05 数据查询 - -- 简单查询 - - ```sql - mysql> SELECT * FROM iceberg.nyc.taxis; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.37 sec) - - mysql> SELECT * FROM iceberg.nyc.taxis2; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.35 sec) - ``` - -- 分区剪裁 - - ```sql - mysql> SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - 
+-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 1 row in set (0.06 sec) - - mysql> EXPLAIN VERBOSE SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'; - - .... - | 0:VICEBERG_SCAN_NODE(71) - | table: taxis - | predicates: (ts[#5] < '2024-01-02 00:00:00'), (vendor_id[#0] = 2), (ts[#5] >= '2024-01-01 00:00:00') - | inputSplitNum=1, totalFileSize=3539, scanRanges=1 - | partition=1/0 - | backends: - | 10002 - | s3://warehouse/wh/nyc/taxis/data/vendor_id=2/ts_day=2024-01-01/40e6ca404efa4a44-b888f23546d3a69c_5708e229-2f3d-4b68-a66b-44298a9d9815-0.zstd.parquet start: 0 length: 3539 - | cardinality=6, numNodes=1 - | pushdown agg=NONE - | icebergPredicatePushdown= - | ref(name="ts") < 1704153600000000 - | ref(name="vendor_id") == 2 - | ref(name="ts") >= 1704067200000000 - .... - ``` - - 通过 `EXPLAIN VERBOSE` 语句的结果可知,`vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'` 谓词条件,最终只命中一个分区(`partition=1/0`)。 - - 同时也可知,因为在建表时指定了分区 Transform 函数 `DAY(ts)`,原始数据中的的值 `2024-01-01 03:25:15.000000` 会被转换成文件目录中的分区信息 `ts_day=2024-01-01`。 - -### 06 Time Travel - -我们先再次插入几行数据: - -```sql -INSERT INTO iceberg.nyc.taxis VALUES (1, 1000375, 8.8, 55.55, 'Y', '2024-01-01 8:10:22'), (3, 1000376, 7.4, 32.35, 'N', '2024-01-02 1:14:45'); -Query OK, 2 rows affected (0.17 sec) -{'status':'COMMITTED', 'txnId':'10086'} - -mysql> SELECT * FROM iceberg.nyc.taxis; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.11 sec) -``` - -使用 `iceberg_meta` 表函数查询表的快照信息: - -```sql -mysql> select * from iceberg_meta("table" = "iceberg.nyc.taxis", "query_type" = "snapshots"); -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| committed_at | snapshot_id | parent_id | operation | manifest_list | summary | -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 
2024-07-29 03:38:22 | 8483933166442433486 | -1 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-8483933166442433486-1-5f7b7736-8022-4ba1-9db2-51ae7553be4d.avro | {"added-data-files":"4","added-records":"4","added-files-size":"14156","changed-partition-count":"4","total-records":"4","total-files-size":"14156","total-data-files":"4","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -| 2024-07-29 03:40:23 | 4726331391239920914 | 8483933166442433486 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-4726331391239920914-1-6aa3d142-6c9c-4553-9c04-08ad4d49a4ea.avro | {"added-data-files":"2","added-records":"2","added-files-size":"7078","changed-partition-count":"2","total-records":"6","total-files-size":"21234","total-data-files":"6","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -2 rows in set (0.07 sec) -``` - -使用 `FOR VERSION AS OF` 语句查询指定快照: - -```sql -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 8483933166442433486; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 4726331391239920914; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.04 sec) -``` - -使用 `FOR TIME AS OF` 语句查询指定快照: - -```sql -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:38:23"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 
2024-01-01 09:15:23.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.04 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:40:22"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) -``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md deleted file mode 100644 index 8dfba38dac237..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md +++ /dev/null @@ -1,349 +0,0 @@ ---- -{ - "title": "使用 Doris 和 LakeSoul", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 LakeSoul 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-lakesoul.png) - -## Apache Doris & LakeSoul - -LakeSoul 是由数元灵开发的云原生湖仓框架,并在 2023 年 5 月捐赠给了 Linux 基金会 AI & Data 基金会。它以元数据管理的高可扩展性、ACID 事务、高效灵活的 upsert 操作、模式演变和批流集成处理为特点。 - -借助 Apache Doris 的高性能查询引擎和 LakeSoul 的高效数据管理,用户可以实现: - -- 实时数据入湖:利用 LakeSoul 的架构,数据可以以高效率和低延迟入湖,支持包括聚合、去重和部分列更新在内的各种数据更新能力。 -- 高性能数据处理和分析:LakeSoul 的批流集成处理和模式演变等能力可以与 Doris 的强大查询引擎无缝集成,实现湖数据的快速查询和分析响应。 -未来,Apache Doris 将逐步支持 LakeSoul 的更多高级功能,如 CDC 流同步和自动模式演变,共同构建统一的、高性能的、实时的湖仓平台。 - -本文将解释如何快速搭建 Apache Doris + LakeSoul 测试和演示环境,并演示各种功能的使用方法,展示在湖仓架构中使用两个系统集成和优势。 - -关于更多说明,请参阅 [LakeSoul Catalog](../../../lakehouse/datalake-analytics/lakesoul) - -## 使用指南 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/lakesoul](https://github.com/apache/doris/tree/master/samples/datalake/lakesoul) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 3.0.2| -| LakeSoul | 2.6.1 | -| Postgres | 14.5 | -| Apache Spark | 3.3.1 | -| Apache Flink | 1.17 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - - - -### 02 环境部署 - -1. 启动所有组件 - - - ``` - bash ./start_all.sh - ``` - -2. 
启动后,可以使用以下脚本登录到 Doris 命令行: - - ``` - -- login doris - bash ./start_doris_client.sh - ``` - - -### 03 数据查询 - -如下所示,在 Doris 集群中已经创建了一个名为 lakesoul 的 Catalog(可使用 SHOW CATALOGS 查看)。以下是该 Catalog 的创建语句: - -```sql - -- Already created - CREATE CATALOG `lakesoul` PROPERTIES ( - 'type'='lakesoul', - 'lakesoul.pg.username'='lakesoul_test', - 'lakesoul.pg.password'='lakesoul_test', - 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', - 'minio.endpoint'='http://minio:9000', - 'minio.access_key'='admin', - 'minio.secret_key'='password' - ); - - ``` - LakeSoul 表 `lakesoul.tpch.customer` 已加载到 Doris 中。在 Doris 中查询数据。 - -- 查询数据 - ```sql - Doris> use `lakesoul`.`tpch`; - Database changed - - Doris> show tables; - +---------------------+ - | Tables_in_tpch | - +---------------------+ - | customer_from_spark | - +---------------------+ - 1 row in set (0.00 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | - | 14 | Customer#000000014 | KXkletMlL2JQEA | 1 | 11-845-129-3851 | 5266.30 | FURNITURE | , ironic packages across the unus | - | 30 | Customer#000000030 | nJDsELGAavU63Jl0c5NKsKfL8rIJQQkQnYL2QJY | 1 | 11-764-165-5076 | 9321.01 | BUILDING | lithely final requests. furiously unusual account | - | 59 | Customer#000000059 | zLOCP0wh92OtBihgspOGl4 | 1 | 11-355-584-3112 | 3458.60 | MACHINERY | ously final packages haggle blithely after the express deposits. furiou | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - 4 rows in set (3.14 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey desc limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | 14983 | Customer#000014983 | ERN3vq5Fvt4DL | 1 | 11-424-279-1846 | 841.22 | AUTOMOBILE | furiously slyly special foxes. express theodolites cajole carefully. 
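-
-  -- 补充示意(未在上述会话中实际执行):也可以直接在 Doris 中对 LakeSoul 表做聚合分析,例如:
-  SELECT c_mktsegment, count(*) AS cnt, sum(c_acctbal) AS total_acctbal
-  FROM lakesoul.tpch.customer_from_spark
-  GROUP BY c_mktsegment
-  ORDER BY cnt DESC;
-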
special dinos haggle pinto | - | 14968 | Customer#000014968 | ,sykKTZBzVFl7ito1750v2TRYwmkRl2nvqGHwmx | 1 | 11-669-222-9657 | 6106.77 | HOUSEHOLD | ts above the furiously even deposits haggle across | - | 14961 | Customer#000014961 | JEIORcsBp6RpLYH 9gNdDyWJ | 1 | 11-490-251-5554 | 4006.35 | HOUSEHOLD | quests detect carefully final platelets! quickly final frays haggle slyly blithely final acc | - | 14940 | Customer#000014940 | bNoyCxPuqSwPLjbqjEUNGN d0mSP | 1 | 11-242-677-1085 | 8829.48 | HOUSEHOLD | ver the quickly express braids. regular dependencies haggle fluffily quickly i | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - 4 rows in set (0.10 sec) - ``` - -- 分区裁剪 - Doris 可以对 LakeSoul 执行分区裁剪,并通过原生读取加速查询过程。我们可以通过 `explain verbose` 来检查这一点。 - - - ```sql - Doris> explain verbose select * from customer_from_spark where c_nationkey < 3; - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | Explain String(Old Planner) | - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | PLAN FRAGMENT 0 | - | OUTPUT EXPRS: | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_custkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_name` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_address` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_nationkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_phone` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_acctbal` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_mktsegment` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_comment` | - | PARTITION: UNPARTITIONED | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | VRESULT SINK | - | MYSQL_PROTOCAL | - | | - | 1:VEXCHANGE | - | offset: 0 | - | tuple ids: 0 | - | | - | PLAN FRAGMENT 1 | - | | - | PARTITION: RANDOM | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | STREAM DATA SINK | - | EXCHANGE ID: 01 | - | UNPARTITIONED | - | | - | 0:VplanNodeName | - | table: customer_from_spark | - | predicates: (`c_nationkey` < 3) | - | inputSplitNum=12, totalFileSize=0, scanRanges=12 | - | partition=0/0 | - | backends: | - | 10002 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00000-0568c817-d6bc-4fa1-bb9e-b311069b131c_00000.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00001-d99a8fe6-61ab-4285-94da-2f84f8746a8a_00001.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00002-8a8e396f-685f-4b0f-87fa-e2a3fe5be87e_00002.c000.parquet start: 0 length: 0 | - | ... other 8 files ... 
| - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=0/part-00003-d5b598cd-5bed-412c-a26f-bb4bc9c937bc_00003.c000.parquet start: 0 length: 0 | - | numNodes=1 | - | pushdown agg=NONE | - | tuple ids: 0 | - | | - | Tuples: | - | TupleDescriptor{id=0, tbl=customer_from_spark} | - | SlotDescriptor{id=0, col=c_custkey, colUniqueId=0, type=int, nullable=false, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=1, col=c_name, colUniqueId=1, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=2, col=c_address, colUniqueId=2, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=3, col=c_nationkey, colUniqueId=3, type=int, nullable=false, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=4, col=c_phone, colUniqueId=4, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=5, col=c_acctbal, colUniqueId=5, type=decimalv3(15,2), nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=6, col=c_mktsegment, colUniqueId=6, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=7, col=c_comment, colUniqueId=7, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - 57 rows in set (0.03 sec) - - ``` - - - 通过检查 `EXPLAIN VERBOSE` 语句的结果,可以看到谓词条件 `c_nationkey < 3` 最终只命中一个分区(partition=0/0)。 - -### 04 CDC 表支持 - -启动 Flink CDC 作业以同步 MySQL 表。MySQL 表在启动 `start_all.sh` 时已经被加载了。 - - -``` -bash start_flink_cdc_job.sh -``` - -```sql -Start flink-cdc job... -SLF4J: Class path contains multiple SLF4J bindings. -SLF4J: Found binding in [jar:file:/opt/flink/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class] -SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class] -SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. -SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory] -Loading class `com.mysql.jdbc.Driver'. This is deprecated. The new driver class is `com.mysql.cj.jdbc.Driver'. The driver is automatically registered via the SPI and manual loading of the driver class is generally unnecessary. 
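--- 说明:上面的 SLF4J 多绑定提示与 MySQL 驱动类弃用提示仅为告警信息,一般不影响作业提交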
-Job has been submitted with JobID d1b3641dcd1ad85c6b373d49b1867e68 - -``` - - -Flink CDC 作业将启动。我们可以通过重新创建 LakeSoul Catalog 在 `doris client` 中检查启动过程。Flink CDC 作业启动后,我们可以在 `doris client` 中看到正在同步的 LakeSoul CDC 表。 - -```sql -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - - -Doris> drop catalog if exists lakesoul; -Query OK, 0 rows affected (0.00 sec) - -Doris> create catalog `lakesoul` properties ('type'='lakesoul', 'lakesoul.pg.username'='lakesoul_test', 'lakesoul.pg.password'='lakesoul_test', 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', 'minio.endpoint'='http://minio:9000', 'minio.access_key'='admin', 'minio.secret_key'='password'); -Query OK, 0 rows affected (0.01 sec) - -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer | -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. 
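-
--- 补充示意(未在上述会话中实际执行):除了重建 Catalog,一般也可以用 REFRESH 刷新元数据来同步新表,具体行为以所用 Doris 版本为准:
-REFRESH CATALOG lakesoul;
-SHOW TABLES;
-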
furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (1.09 sec) - -``` - -进入 `mysql client` 并尝试修改数据。 - - -``` -bash start_mysql_client.sh -``` - -尝试从 `mysql client` 更新行。 - - -```sql -mysql> update customer set c_acctbal=2211.26 where c_custkey=1; -Query OK, 1 row affected (0.01 sec) -Rows matched: 1 Changed: 1 Warnings: 0 -``` - -回到 `doris client` 并检查数据变化。 - - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. 
carefully even theodolites haggle slyly along the ide | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (0.11 sec) - -``` - -尝试从 `mysql client` 删除行。 - - -```sql -mysql> delete from customer where c_custkey = 2; -Query OK, 1 row affected (0.01 sec) -``` - -回到 `doris client` 并检查数据变化。 - - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. 
foxes cajole accor | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -8 rows in set (0.11 sec) - -``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md deleted file mode 100644 index 5c7b7b98b5b6d..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md +++ /dev/null @@ -1,269 +0,0 @@ ---- -{ - "title": "使用 Doris 和 Paimon", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 Paimon 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-paimon.png) - -## Apache Doris & Paimon - -Apache Paimon 是一种数据湖格式,并创新性地将数据湖格式和 LSM 结构的优势相结合,成功将高效的实时流更新能力引入数据湖架构中,这使得 Paimon 能够实现数据的高效管理和实时分析,为构建实时湖仓架构提供了强大的支撑。 - -为了充分发挥 Paimon 的能力,提高对 Paimon 数据的查询效率,Apache Doris 对 Paimon 的多项最新特性提供了原生支持: - -- 支持 Hive Metastore、FileSystem 等多种类型的 Paimon Catalog。 -- 原生支持 Paimon 0.6 版本发布的 Primary Key Table Read Optimized 功能。 -- 原生支持 Paimon 0.8 版本发布的 Primary Key Table Deletion Vector 功能。 - -基于 Apache Doris 的高性能查询引擎和 Apache Paimon 高效的实时流更新能力,用户可以实现: - -- 数据实时入湖:借助 Paimon 的 LSM-Tree 模型,数据入湖的时效性可以降低到分钟级;同时,Paimon 支持包括聚合、去重、部分列更新在内的多种数据更新能力,使得数据流动更加灵活高效。 -- 高性能数据处理分析:Paimon 所提供的 Append Only Table、Read Optimized、Deletion Vector 等技术,可与 Doris 强大的查询引擎对接,实现湖上数据的快速查询及分析响应。 - -未来 Apache Doris 将会逐步支持包括 Time Travel、增量数据读取在内的 Apache Paimon 更多高级特性,共同构建统一、高性能、实时的湖仓平台。 - -本文将会再 Docker 环境中,为读者讲解如何快速搭建 Apache Doris + Apache Paimon 测试 & 演示环境,并展示各功能的使用操作。 - -关于更多说明,请参阅 [Paimon Catalog](../../../lakehouse/datalake-analytics/paimon.md) - -## 使用指南 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 2.1.5,可修改 | -| Apache Paimon | 0.8| -| Apache Flink | 1.18| -| MinIO | RELEASE.2024-04-29T09-56-05Z| - -### 02 环境部署 - -1. 启动所有组件 - - `bash ./start_all.sh` - -2. 启动后,可以使用如下脚本,登陆 Flink 命令行或 Doris 命令行: - - ```sql - -- login flink - bash ./start_flink_client.sh - - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 数据准备 - -首先登陆 Flink 命令行后,可以看到一张预构建的表。表中已经包含一些数据,我们可以通过 Flink SQL 进行查看。 - -```sql -Flink SQL> use paimon.db_paimon; -[INFO] Execute statement succeed. 
- -Flink SQL> show tables; -+------------+ -| table name | -+------------+ -| customer | -+------------+ -1 row in set - -Flink SQL> show create table customer; -+------------------------------------------------------------------------+ -| result | -+------------------------------------------------------------------------+ -| CREATE TABLE `paimon`.`db_paimon`.`customer` ( - `c_custkey` INT NOT NULL, - `c_name` VARCHAR(25), - `c_address` VARCHAR(40), - `c_nationkey` INT NOT NULL, - `c_phone` CHAR(15), - `c_acctbal` DECIMAL(12, 2), - `c_mktsegment` CHAR(10), - `c_comment` VARCHAR(117), - CONSTRAINT `PK_c_custkey_c_nationkey` PRIMARY KEY (`c_custkey`, `c_nationkey`) NOT ENFORCED -) PARTITIONED BY (`c_nationkey`) -WITH ( - 'bucket' = '1', - 'path' = 's3://warehouse/wh/db_paimon.db/customer', - 'deletion-vectors.enabled' = 'true' -) - | -+-------------------------------------------------------------------------+ -1 row in set - -Flink SQL> desc customer; -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| name | type | null | key | extras | watermark | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| c_custkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_name | VARCHAR(25) | TRUE | | | | -| c_address | VARCHAR(40) | TRUE | | | | -| c_nationkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_phone | CHAR(15) | TRUE | | | | -| c_acctbal | DECIMAL(12, 2) | TRUE | | | | -| c_mktsegment | CHAR(10) | TRUE | | | | -| c_comment | VARCHAR(117) | TRUE | | | | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -8 rows in set - -Flink SQL> select * from customer order by c_custkey limit 4; -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platel... | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic... | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic,... | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tl... | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious ... 
| -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -4 rows in set -``` - -### 04 数据查询 - -如下所示,Doris 集群中已经创建了名为 `paimon` 的 Catalog(可通过 SHOW CATALOGS 查看)。以下为该 Catalog 的创建语句: - -```sql --- 已创建,无需执行 -CREATE CATALOG `paimon` PROPERTIES ( - "type" = "paimon", - "warehouse" = "s3://warehouse/wh/", - "s3.endpoint"="http://minio:9000", - "s3.access_key"="admin", - "s3.secret_key"="password", - "s3.region"="us-east-1" -); -``` - -你可登录到 Doris 中查询 Paimon 的数据: - -```sql -mysql> use paimon.db_paimon; -Reading table information for completion of table and column names -You can turn off this feature to get a quicker startup with -A - -Database changed -mysql> show tables; -+---------------------+ -| Tables_in_db_paimon | -+---------------------+ -| customer | -+---------------------+ -1 row in set (0.00 sec) - -mysql> select * from customer order by c_custkey limit 4; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -4 rows in set (1.89 sec) -``` - -### 05 读取增量数据 - -我们可以通过 Flink SQL 更新 Paimon 表中的数据: - -```sql -Flink SQL> update customer set c_address='c_address_update' where c_nationkey = 1; -[INFO] Submitting SQL update statement to the cluster... 
-[INFO] SQL update statement has been successfully submitted to the cluster: -Job ID: ff838b7b778a94396b332b0d93c8f7ac -``` - -等 Flink SQL 执行完毕后,在 Doris 中可直接查看到最新的数据: - -```sql -mysql> select * from customer where c_nationkey=1 limit 2; -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| 3 | Customer#000000003 | c_address_update | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 513 | Customer#000000513 | c_address_update | 1 | 11-861-303-6887 | 955.37 | HOUSEHOLD | press along the quickly regular instructions. regular requests against the carefully ironic s | -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -2 rows in set (0.19 sec) -``` - -### Benchmark - -我们在 Paimon(0.8)版本的 TPCDS 1000 数据集上进行了简单的测试,分别使用了 Apache Doris 2.1.5 版本和 Trino 422 版本,均开启 Primary Key Table Read Optimized 功能。 - -![](/images/quick-start/lakehouse-paimon-benchmark.PNG) - -从测试结果可以看到,Doris 在标准静态测试集上的平均查询性能是 Trino 的 3~5 倍。后续我们将针对 Deletion Vector 进行优化,进一步提升真实业务场景下的查询效率。 - -## 查询优化 - -对于基线数据来说,Apache Paimon 在 0.6 版本中引入 Primary Key Table Read Optimized 功能后,使得查询引擎可以直接访问底层的 Parquet/ORC 文件,大幅提升了基线数据的读取效率。对于尚未合并的增量数据(INSERT、UPDATE 或 DELETE 所产生的数据增量)来说,可以通过 Merge-on-Read 的方式进行读取。此外,Paimon 在 0.8 版本中还引入的 Deletion Vector 功能,能够进一步提升查询引擎对增量数据的读取效率。 -Apache Doris 支持通过原生的 Reader 读取 Deletion Vector 并进行 Merge on Read,我们通过 Doris 的 EXPLAIN 语句,来演示在一个查询中,基线数据和增量数据的查询方式。 - -```sql -mysql> explain verbose select * from customer where c_nationkey < 3; -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| Explain String(Nereids Planner) | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| ............... 
| -| | -| 0:VPAIMON_SCAN_NODE(68) | -| table: customer | -| predicates: (c_nationkey[#3] < 3) | -| inputSplitNum=4, totalFileSize=238324, scanRanges=4 | -| partition=3/0 | -| backends: | -| 10002 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-15cee5b7-1bd7-42ca-9314-56d92c62c03b-0.orc start: 0 length: 66600 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-5d50255a-2215-4010-b976-d5dc656f3444-0.orc start: 0 length: 44501 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=2/bucket-0/data-e98fb7ef-ec2b-4ad5-a496-713cb9481d56-0.orc start: 0 length: 64059 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=0/bucket-0/data-431be05d-50fa-401f-9680-d646757d0f95-0.orc start: 0 length: 63164 | -| cardinality=18751, numNodes=1 | -| pushdown agg=NONE | -| paimonNativeReadSplits=4/4 | -| PaimonSplitStats: | -| SplitStat [type=NATIVE, rowCount=1542, rawFileConvertable=true, hasDeletionVector=true] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| tuple ids: 0 -| ............... | | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -67 rows in set (0.23 sec) -``` - -可以看到,对于刚才通过 Flink SQL 更新的表,包含 4 个分片,并且全部分片都可以通过 Native Reader 进行访问(`paimonNativeReadSplits=4/4`)。并且第一个分片的`hasDeletionVector`的属性为`true`,表示该分片有对应的 Deletion Vector,读取时会根据 Deletion Vector 进行数据过滤。 \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/what-is-apache-doris.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/what-is-apache-doris.md index b47121e05b22b..6a8d89c14e980 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/what-is-apache-doris.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/what-is-apache-doris.md @@ -104,6 +104,6 @@ Apache Doris 也支持**强一致的物化视图**,物化视图的更新和选 ![Doris 查询引擎是向量化](/images/getting-started/apache-doris-query-engine-2.png) -**Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术,**可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 +**Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术,** 可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 在**优化器**方面,Apache Doris 使用 CBO 和 RBO 结合的优化策略,RBO 支持常量折叠、子查询改写、谓词下推等,CBO 支持 Join Reorder。目前 CBO 还在持续优化中,主要集中在更加精准的统计信息收集和推导,更加精准的代价模型预估等方面。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/data-operate/update/unique-update-concurrent-control.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/data-operate/update/unique-update-concurrent-control.md index 7fb631fc893d1..76a43de69ef46 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/data-operate/update/unique-update-concurrent-control.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/data-operate/update/unique-update-concurrent-control.md @@ -28,7 +28,7 @@ under the License. 
Doris 采用多版本并发控制机制(MVCC - Multi-Version Concurrency Control)来管理并发更新。每次数据写入操作均会分配一个写入事务,该事务确保数据写入的原子性(即写入操作要么完全成功,要么完全失败)。在写入事务提交时,系统会为其分配一个版本号。当用户使用 Unique Key 模型并多次导入数据时,如果存在重复主键,Doris 会根据版本号确定覆盖顺序:版本号较高的数据会覆盖版本号较低的数据。 -在某些场景中,用户可能需要通过在建表语句中指定 sequence 列来灵活调整数据的生效顺序。例如,当通过多线程并发同步数据到 Doris 时,不同线程的数据可能会乱序到达。这种情况下,可能出现旧数据因较晚到达而错误覆盖新数据的情况。为解决这一问题,用户可以为旧数据指定较低的 sequence 值,为新数据指定较高的 sequence 值,从而让 Doris 根据用户提供的 sequence值来正确确定数据的更新顺序。 +在某些场景中,用户可能需要通过在建表语句中指定 sequence 列来灵活调整数据的生效顺序。例如,当通过多线程并发同步数据到 Doris 时,不同线程的数据可能会乱序到达。这种情况下,可能出现旧数据因较晚到达而错误覆盖新数据的情况。为解决这一问题,用户可以为旧数据指定较低的 sequence 值,为新数据指定较高的 sequence 值,从而让 Doris 根据用户提供的 sequence 值来正确确定数据的更新顺序。 此外,`UPDATE` 语句与通过导入实现更新在底层机制上存在较大差异。`UPDATE` 操作涉及两个步骤:从数据库中读取待更新的数据,以及写入更新后的数据。默认情况下,`UPDATE` 语句通过表级锁提供了 Serializable 隔离级别的事务能力,即多个 `UPDATE` 操作只能串行执行。用户也可以通过调整配置绕过这一限制,具体方法请参阅以下章节的详细说明。 @@ -72,7 +72,7 @@ sequence 列目前只支持 Unique 模型。 **Sequence 列建表时有两种方式,一种是建表时设置`sequence_col`属性,一种是建表时设置`sequence_type`属性。** -**1. 设置****`sequence_col`(推荐)** +**1. 设置`sequence_col`(推荐)** 创建 Unique 表时,指定 sequence 列到表中其他 column 的映射 @@ -86,7 +86,7 @@ sequence_col 用来指定 sequence 列到表中某一列的映射,该列可以 导入方式和没有 sequence 列时一样,使用相对比较简单,推荐使用。 -**2. 设置****`sequence_type`** +**2. 设置 `sequence_type`** 创建 Unique 表时,指定 sequence 列类型 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md deleted file mode 100644 index 19afdf3598fa4..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md +++ /dev/null @@ -1,314 +0,0 @@ ---- -{ - "title": "使用 Doris 和 Hudi", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 Hudi 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-hudi.png) - -## Apache Doris & Hudi - -[Apache Hudi](https://hudi.apache.org/) 是目前最主流的开放数据湖格式之一,也是事务性的数据湖管理平台,支持包括 Apache Doris 在内的多种主流查询引擎。 - -Apache Doris 同样对 Apache Hudi 数据表的读取能力进行了增强: - -- 支持 Copy on Write Table:Snapshot Query -- 支持 Merge on Read Table:Snapshot Queries, Read Optimized Queries -- 支持 Time Travel -- 支持 Incremental Read - -凭借 Apache Doris 的高性能查询执行以及 Apache Hudi 的实时数据管理能力,可以实现高效、灵活、低成本的数据查询和分析,同时也提供了强大的数据回溯、审计和增量处理功能,当前基于 Apache Doris 和 Apache Hudi 的组合已经在多个社区用户的真实业务场景中得到验证和推广: - -- 实时数据分析与处理:比如金融行业交易分析、广告行业实时点击流分析、电商行业用户行为分析等常见场景下,都要求实时的数据更新及查询分析。Hudi 能够实现对数据的实时更新和管理,并保证数据的一致性和可靠性,Doris 则能够实时高效处理大规模数据查询请求,二者结合能够充分满足实时数据分析与处理的需求。 -- 数据回溯与审计:对于金融、医疗等对数据安全和准确性要求极高的行业来说,数据回溯和审计是非常重要的功能。Hudi 提供了时间旅行(Time Travel)功能,允许用户查看历史数据状态,结合 Apache Doris 高效查询能力,可快速查找分析任何时间点的数据,实现精确的回溯和审计。 -- 增量数据读取与分析:在进行大数据分析时往往面临着数据规模庞大、更新频繁的问题,Hudi 支持增量数据读取,这使得用户可以只需处理变化的数据,不必进行全量数据更新;同时 Apache Doris 的 Incremental Read 功能也可使这一过程更加高效,显著提升了数据处理和分析的效率。 -- 跨数据源联邦查询:许多企业数据来源复杂,数据可能存储在不同的数据库中。Doris 的 Multi-Catalog 功能支持多种数据源的自动映射与同步,支持跨数据源的联邦查询。这对于需要从多个数据源中获取和整合数据进行分析的企业来说,极大地缩短了数据流转路径,提升了工作效率。 - -本文将在 Docker 环境下,为读者介绍如何快速搭建 Apache Doris 
+ Apache Hudi 的测试及演示环境,并对各功能操作进行演示,帮助读者快速入门。 - -关于更多说明,请参阅 [Hudi Catalog](../../../lakehouse/datalake-analytics/hudi) - -## 使用指南 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/hudi](https://github.com/apache/doris/tree/master/samples/datalake/hudi) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 2.1.4,可修改 | -| Apache Hudi | 0.14| -| Apache Spark | 3.4.2| -| Apache Hive | 2.1.3| -| MinIO | 2022-05-26T05-48-41Z| - - -### 02 环境部署 - -1. 创建 Docker 网络 - - `sudo docker network create -d bridge hudi-net` - -2. 启动所有组件 - - `sudo ./start-hudi-compose.sh` - - > 注:启动前,可将 `start-hudi-compose.sh` 中的 `DORIS_PACKAGE` 和 `DORIS_DOWNLOAD_URL` 修改成需要的 Doris 版本。建议使用 2.1.4 或更高版本。 - -3. 启动后,可以使用如下脚本,登陆 Spark 命令行或 Doris 命令行: - - ```sql - -- Doris - sudo ./login-spark.sh - - -- Spark - sudo ./login-doris.sh - ``` - -### 03 数据准备 - -接下来先通过 Spark 生成 Hudi 的数据。如下方代码所示,集群中已经包含一张名为 `customer` 的 Hive 表,可以通过这张 Hive 表,创建一个 Hudi 表: - -```sql --- ./login-spark.sh -spark-sql> use default; - --- create a COW table -spark-sql> CREATE TABLE customer_cow -USING hudi -TBLPROPERTIES ( - type = 'cow', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; - --- create a MOR table -spark-sql> CREATE TABLE customer_mor -USING hudi -TBLPROPERTIES ( - type = 'mor', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; -``` - -### 04 数据查询 - -如下所示,Doris 集群中已经创建了名为 `hudi` 的 Catalog(可通过 `SHOW CATALOGS` 查看)。以下为该 Catalog 的创建语句: - -```sql --- 已经创建,无需再次执行 -CREATE CATALOG `hudi` PROPERTIES ( - "type"="hms", - 'hive.metastore.uris' = 'thrift://hive-metastore:9083', - "s3.access_key" = "minio", - "s3.secret_key" = "minio123", - "s3.endpoint" = "http://minio:9000", - "s3.region" = "us-east-1", - "use_path_style" = "true" -); -``` - -1. 手动刷新该 Catalog,对创建的 Hudi 表进行同步: - - ```sql - -- ./login-doris.sh - doris> REFRESH CATALOG hudi; - ``` - -2. 使用 Spark 操作 Hudi 中的数据,都可以在 Doris 中实时可见,不需要再次刷新 Catalog。我们通过 Spark 分别给 COW 和 MOR 表插入一行数据: - - ```sql - spark-sql> insert into customer_cow values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 25); - spark-sql> insert into customer_mor values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 25); - ``` - -3. 通过 Doris 可以直接查询到最新插入的数据: - - ```sql - doris> use hudi.default; - doris> select * from customer_cow where c_custkey = 100; - doris> select * from customer_mor where c_custkey = 100; - ``` - -4. 再通过 Spark 插入 c_custkey=32 已经存在的数据,即覆盖已有数据: - - ```sql - spark-sql> insert into customer_cow values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - spark-sql> insert into customer_mor values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - ``` - -5. 
通过 Doris 可以查询更新后的数据: - - ```sql - doris> select * from customer_cow where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - doris> select * from customer_mor where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - ``` - -### 05 Incremental Read - -Incremental Read 是 Hudi 提供的功能特性之一,通过 Incremental Read,用户可以获取指定时间范围的增量数据,从而实现对数据的增量处理。对此,Doris 可对插入 `c_custkey=100` 后的变更数据进行查询。如下所示,我们插入了一条 `c_custkey=32` 的数据: - -```sql -doris> select * from customer_cow@incr('beginTime'='20240603015018572'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_cow', 'latest_state', '20240603015018572'); - -doris> select * from customer_mor@incr('beginTime'='20240603015058442'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. 
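-
--- 补充示意(未实际执行):如需限定增量区间,可尝试同时指定结束时间;此处假设所用版本的 @incr 支持 'endTime' 参数,时间戳请替换为实际的提交时间
-SELECT * FROM customer_cow@incr('beginTime'='20240603015018572', 'endTime'='20240603015444737');
-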
final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_mor', 'latest_state', '20240603015058442'); -``` - -### 06 TimeTravel - -Doris 支持查询指定快照版本的 Hudi 数据,从而实现对数据的 Time Travel 功能。首先,可以通过 Spark 查询两张 Hudi 表的提交历史: - -```sql -spark-sql> call show_commits(table => 'customer_cow', limit => 10); -20240603033556094 20240603033558249 commit 448833 0 1 1 183 0 0 -20240603015444737 20240603015446588 commit 450238 0 1 1 202 1 0 -20240603015018572 20240603015020503 commit 436692 1 0 1 1 0 0 -20240603013858098 20240603013907467 commit 44902033 100 0 25 18751 0 0 - -spark-sql> call show_commits(table => 'customer_mor', limit => 10); -20240603033745977 20240603033748021 deltacommit 1240 0 1 1 0 0 0 -20240603015451860 20240603015453539 deltacommit 1434 0 1 1 1 1 0 -20240603015058442 20240603015100120 deltacommit 436691 1 0 1 1 0 0 -20240603013918515 20240603013922961 deltacommit 44904040 100 0 25 18751 0 0 -``` - -接着,可通过 Doris 执行 `c_custkey=32` ,查询数据插入之前的数据快照。如下可看到 `c_custkey=32` 的数据还未更新: - -> 注:Time Travel 语法暂时不支持新优化器,需要先执行 set enable_nereids_planner=false;关闭新优化器,该问题将会在后续版本中修复。 - -```sql -doris> select * from customer_cow for time as of '20240603015018572' where c_custkey = 32 or c_custkey = 100; -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | 15 | -| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 25 | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ --- compare with spark-sql -spark-sql> select * from customer_mor timestamp as of '20240603015018572' where c_custkey = 32 or c_custkey = 100; - -doris> select * from customer_mor for time as of '20240603015058442' where c_custkey = 32 or c_custkey = 100; -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 25 | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. 
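-
--- 补充示意(未实际执行):结合上文注意事项,完整的执行顺序是先关闭新优化器,再执行 Time Travel 查询
-set enable_nereids_planner=false;
-select * from customer_cow for time as of '20240603015018572' where c_custkey = 32;
-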
final, furious requests across the e | 15 | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -spark-sql> select * from customer_mor timestamp as of '20240603015058442' where c_custkey = 32 or c_custkey = 100; -``` - -## 查询优化 - -Apache Hudi 中的数据大致可以分为两类 —— 基线数据和增量数据。基线数据通常是已经经过合并的 Parquet 文件,而增量数据是指由 INSERT、UPDATE 或 DELETE 产生的数据增量。基线数据可以直接读取,增量数据需要通过 Merge on Read 的方式进行读取。 - -对于 Hudi COW 表的查询或者 MOR 表的 Read Optimized 查询而言,其数据都属于基线数据,可直接通过 Doris 原生的 Parquet Reader 读取数据文件,且可获得极速的查询响应。而对于增量数据,Doris 需要通过 JNI 调用 Hudi 的 Java SDK 进行访问。为了达到最优的查询性能,Apache Doris 在查询时,会将一个查询中的数据分为基线和增量数据两部分,并分别使用上述方式进行读取。 - -为验证该优化思路,我们通过 EXPLAIN 语句来查看一个下方示例的查询中,分别有多少基线数据和增量数据。对于 COW 表来说,所有 101 个数据分片均为是基线数据(`hudiNativeReadSplits=101/101`),因此 COW 表全部可直接通过 Doris Parquet Reader 进行读取,因此可获得最佳的查询性能。对于 ROW 表,大部分数据分片是基线数据(`hudiNativeReadSplits=100/101`),一个分片数为增量数据,基本也能够获得较好的查询性能。 - -```sql --- COW table is read natively -doris> explain select * from customer_cow where c_custkey = 32; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_cow | -| predicates: (c_custkey[#5] = 32) | -| inputSplitNum=101, totalFileSize=45338886, scanRanges=101 | -| partition=26/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=101/101 | - --- MOR table: because only the base file contains `c_custkey = 32` that is updated, 100 splits are read natively, while the split with log file is read by JNI. -doris> explain select * from customer_mor where c_custkey = 32; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 32) | -| inputSplitNum=101, totalFileSize=45340731, scanRanges=101 | -| partition=26/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=100/101 | -``` - -可以通过 Spark 进行一些删除操作,进一步观察 Hudi 基线数据和增量数据的变化: - -```sql --- Use delete statement to see more differences -spark-sql> delete from customer_cow where c_custkey = 64; -doris> explain select * from customer_cow where c_custkey = 64; - -spark-sql> delete from customer_mor where c_custkey = 64; -doris> explain select * from customer_mor where c_custkey = 64; -``` - -此外,还可以通过分区条件进行分区裁剪,从而进一步减少数据量,以提升查询速度。如下示例中,通过分区条件 `c_nationkey=15` 进行分区裁减,使得查询请求只需要访问一个分区(`partition=1/26`)的数据即可。 - -```sql --- customer_xxx is partitioned by c_nationkey, we can use the partition column to prune data -doris> explain select * from customer_mor where c_custkey = 64 and c_nationkey = 15; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 64), (c_nationkey[#12] = 15) | -| inputSplitNum=4, totalFileSize=1798186, scanRanges=4 | -| partition=1/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=3/4 | -``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md deleted file mode 100644 index 16fc1aa20ecbe..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md +++ /dev/null @@ -1,473 +0,0 @@ ---- -{ - "title": "使用 Doris 和 Iceberg", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 
Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 Iceberg 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-iceberg.png) - -## Apache Doris & Iceberg - -Apache Iceberg 是一种开源、高性能、高可靠的数据湖表格式,可实现超大规模数据的分析与管理。它支持 Apache Doris 在内的多种主流查询引擎,兼容 HDFS 以及各种对象云存储,具备 ACID、Schema 演进、高级过滤、隐藏分区和分区布局演进等特性,可确保高性能查询以及数据的可靠性及一致性,其时间旅行和版本回滚功能也为数据管理带来较高的灵活性。 - -Apache Doris 对 Iceberg 多项核心特性提供了原生支持: - -- 支持 Hive Metastore、Hadoop、REST、Glue、Google Dataproc Metastore、DLF 等多种 Iceberg Catalog 类型。 -- 原生支持 Iceberg V1/V2 表格式,以及 Position Delete、Equality Delete 文件的读取。 -- 支持通过表函数查询 Iceberg 表快照历史。 -- 支持时间旅行(Time Travel)功能。 -- 原生支持 Iceberg 表引擎。可以通过 Apache Doris 直接创建、管理以及将数据写入到 Iceberg 表。支持完善的分区 Transform 函数,从而提供隐藏分区和分区布局演进等能力。 - -用户可以基于 Apache Doris + Apache Iceberg 快速构建高效的湖仓一体解决方案,以灵活应对实时数据分析与处理的各种需求: - -- 通过 Doris 高性能查询引擎对 Iceberg 表数据和其他数据源进行关联数据分析,构建**统一的联邦数据分析平台**。 -- 通过 Doris 直接管理和构建 Iceberg 表,在 Doris 中完成对数据的清洗、加工并写入到 Iceberg 表,构建**统一的湖仓数据处理平台**。 -- 通过 Iceberg 表引擎,将 Doris 数据共享给其他上下游系统做进一步处理,构建**统一的开放数据存储平台**。 - -未来,Apache Iceberg 将作为 Apache Doris 的原生表引擎之一,提供更加完善的湖格式数据的分析、管理功能。Apache Doris 也将逐步支持包括 Update/Delete/Merge、写回时排序、增量数据读取、元数据管理等 Apache Iceberg 更多高级特性,共同构建统一、高性能、实时的湖仓平台。 - -关于更多说明,请参阅 [Iceberg Catalog](../../../lakehouse/datalake-analytics/iceberg.md) - -## 使用指南 - -本文档主要讲解如何在 Docker 环境下快速搭建 Apache Doris + Apache Iceberg 测试 & 演示环境,并展示各功能的使用操作。 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 2.1.5,可修改 | -| Apache Iceberg | 1.4.3| -| MinIO | RELEASE.2024-04-29T09-56-05Z| - -### 02 环境部署 - -1. 启动所有组件 - - `bash ./start_all.sh` - -2. 
启动后,可以使用如下脚本,登陆 Doris 命令行: - - ```sql - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 创建 Iceberg 表 - -首先登陆 Doris 命令行后,Doris 集群中已经创建了名为 Iceberg 的 Catalog(可通过 `SHOW CATALOGS`/`SHOW CREATE CATALOG iceberg` 查看)。以下为该 Catalog 的创建语句: - -```sql --- 已创建,无需执行 -CREATE CATALOG `iceberg` PROPERTIES ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "warehouse" = "s3://warehouse/", - "uri" = "http://rest:8181", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "http://minio:9000" -); -``` - -在 Iceberg Catalog 创建数据库和 Iceberg 表: - -```sql -mysql> SWITCH iceberg; -Query OK, 0 rows affected (0.00 sec) - -mysql> CREATE DATABASE nyc; -Query OK, 0 rows affected (0.12 sec) - -mysql> CREATE TABLE iceberg.nyc.taxis - ( - vendor_id BIGINT, - trip_id BIGINT, - trip_distance FLOAT, - fare_amount DOUBLE, - store_and_fwd_flag STRING, - ts DATETIME - ) - PARTITION BY LIST (vendor_id, DAY(ts)) () - PROPERTIES ( - "compression-codec" = "zstd", - "write-format" = "parquet" - ); -Query OK, 0 rows affected (0.15 sec) -``` - -### 04 数据写入 - -向 Iceberg 表中插入数据: - -```sql -mysql> INSERT INTO iceberg.nyc.taxis - VALUES - (1, 1000371, 1.8, 15.32, 'N', '2024-01-01 9:15:23'), - (2, 1000372, 2.5, 22.15, 'N', '2024-01-02 12:10:11'), - (2, 1000373, 0.9, 9.01, 'N', '2024-01-01 3:25:15'), - (1, 1000374, 8.4, 42.13, 'Y', '2024-01-03 7:12:33'); -Query OK, 4 rows affected (1.61 sec) -{'status':'COMMITTED', 'txnId':'10085'} -``` - -通过 `CREATE TABLE AS SELECT` 来创建一张 Iceberg 表: - -``` -mysql> CREATE TABLE iceberg.nyc.taxis2 AS SELECT * FROM iceberg.nyc.taxis; -Query OK, 6 rows affected (0.25 sec) -{'status':'COMMITTED', 'txnId':'10088'} -``` - -### 05 数据查询 - -- 简单查询 - - ```sql - mysql> SELECT * FROM iceberg.nyc.taxis; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.37 sec) - - mysql> SELECT * FROM iceberg.nyc.taxis2; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.35 sec) - ``` - -- 分区剪裁 - - ```sql - mysql> SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - 
+-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 1 row in set (0.06 sec) - - mysql> EXPLAIN VERBOSE SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'; - - .... - | 0:VICEBERG_SCAN_NODE(71) - | table: taxis - | predicates: (ts[#5] < '2024-01-02 00:00:00'), (vendor_id[#0] = 2), (ts[#5] >= '2024-01-01 00:00:00') - | inputSplitNum=1, totalFileSize=3539, scanRanges=1 - | partition=1/0 - | backends: - | 10002 - | s3://warehouse/wh/nyc/taxis/data/vendor_id=2/ts_day=2024-01-01/40e6ca404efa4a44-b888f23546d3a69c_5708e229-2f3d-4b68-a66b-44298a9d9815-0.zstd.parquet start: 0 length: 3539 - | cardinality=6, numNodes=1 - | pushdown agg=NONE - | icebergPredicatePushdown= - | ref(name="ts") < 1704153600000000 - | ref(name="vendor_id") == 2 - | ref(name="ts") >= 1704067200000000 - .... - ``` - - 通过 `EXPLAIN VERBOSE` 语句的结果可知,`vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'` 谓词条件,最终只命中一个分区(`partition=1/0`)。 - - 同时也可知,因为在建表时指定了分区 Transform 函数 `DAY(ts)`,原始数据中的的值 `2024-01-01 03:25:15.000000` 会被转换成文件目录中的分区信息 `ts_day=2024-01-01`。 - -### 06 Time Travel - -我们先再次插入几行数据: - -```sql -INSERT INTO iceberg.nyc.taxis VALUES (1, 1000375, 8.8, 55.55, 'Y', '2024-01-01 8:10:22'), (3, 1000376, 7.4, 32.35, 'N', '2024-01-02 1:14:45'); -Query OK, 2 rows affected (0.17 sec) -{'status':'COMMITTED', 'txnId':'10086'} - -mysql> SELECT * FROM iceberg.nyc.taxis; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.11 sec) -``` - -使用 `iceberg_meta` 表函数查询表的快照信息: - -```sql -mysql> select * from iceberg_meta("table" = "iceberg.nyc.taxis", "query_type" = "snapshots"); -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| committed_at | snapshot_id | parent_id | operation | manifest_list | summary | -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 
2024-07-29 03:38:22 | 8483933166442433486 | -1 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-8483933166442433486-1-5f7b7736-8022-4ba1-9db2-51ae7553be4d.avro | {"added-data-files":"4","added-records":"4","added-files-size":"14156","changed-partition-count":"4","total-records":"4","total-files-size":"14156","total-data-files":"4","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -| 2024-07-29 03:40:23 | 4726331391239920914 | 8483933166442433486 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-4726331391239920914-1-6aa3d142-6c9c-4553-9c04-08ad4d49a4ea.avro | {"added-data-files":"2","added-records":"2","added-files-size":"7078","changed-partition-count":"2","total-records":"6","total-files-size":"21234","total-data-files":"6","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -2 rows in set (0.07 sec) -``` - -使用 `FOR VERSION AS OF` 语句查询指定快照: - -```sql -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 8483933166442433486; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 4726331391239920914; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.04 sec) -``` - -使用 `FOR TIME AS OF` 语句查询指定快照: - -```sql -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:38:23"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 
2024-01-01 09:15:23.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.04 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:40:22"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) -``` - -### 07 与 PyIceberg 交互 - -> 请使用 Doris 2.1.8/3.0.4 以上版本。 - -加载 Iceberg 表: - -```python -from pyiceberg.catalog import load_catalog - -catalog = load_catalog( - "iceberg", - **{ - "warehouse": "warehouse", - "uri": "http://rest:8181", - "s3.access-key-id": "admin", - "s3.secret-access-key": "password", - "s3.endpoint": "http://minio:9000" - }, -) -table = catalog.load_table("nyc.taxis") -``` - -读取为 Arrow Table: - -```python -print(table.scan().to_arrow()) - -pyarrow.Table -vendor_id: int64 -trip_id: int64 -trip_distance: float -fare_amount: double -store_and_fwd_flag: large_string -ts: timestamp[us] ----- -vendor_id: [[1],[1],[2],[2]] -trip_id: [[1000371],[1000374],[1000373],[1000372]] -trip_distance: [[1.8],[8.4],[0.9],[2.5]] -fare_amount: [[15.32],[42.13],[9.01],[22.15]] -store_and_fwd_flag: [["N"],["Y"],["N"],["N"]] -ts: [[2024-01-01 09:15:23.000000],[2024-01-03 07:12:33.000000],[2024-01-01 03:25:15.000000],[2024-01-02 12:10:11.000000]] -``` - -读取为 Pandas DataFrame: - -```python -print(table.scan().to_pandas()) - -vendor_id trip_id trip_distance fare_amount store_and_fwd_flag ts -0 1 1000371 1.8 15.32 N 2024-01-01 09:15:23 -1 1 1000374 8.4 42.13 Y 2024-01-03 07:12:33 -2 2 1000373 0.9 9.01 N 2024-01-01 03:25:15 -3 2 1000372 2.5 22.15 N 2024-01-02 12:10:11 -``` - -读取为 Polars DataFrame: - -```python -import polars as pl - -print(pl.scan_iceberg(table).collect()) - -shape: (4, 6) -┌───────────┬─────────┬───────────────┬─────────────┬────────────────────┬─────────────────────┐ -│ vendor_id ┆ trip_id ┆ trip_distance ┆ fare_amount ┆ store_and_fwd_flag ┆ ts │ -│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ -│ i64 ┆ i64 ┆ f32 ┆ f64 ┆ str ┆ datetime[μs] │ -╞═══════════╪═════════╪═══════════════╪═════════════╪════════════════════╪═════════════════════╡ -│ 1 ┆ 1000371 ┆ 1.8 ┆ 15.32 ┆ N ┆ 2024-01-01 09:15:23 │ -│ 1 ┆ 1000374 ┆ 8.4 ┆ 42.13 ┆ Y ┆ 2024-01-03 07:12:33 │ -│ 2 ┆ 1000373 ┆ 0.9 ┆ 9.01 ┆ N ┆ 2024-01-01 03:25:15 │ -│ 2 ┆ 1000372 ┆ 2.5 ┆ 22.15 ┆ N ┆ 2024-01-02 12:10:11 │ -└───────────┴─────────┴───────────────┴─────────────┴────────────────────┴─────────────────────┘ -``` - -> 如需通过 PyIceberg 写入 Iceberg 数据,请参阅附录中的[通过 PyIceberg 写入数据](#通过-pyiceberg-写入数据) - -### 08 附录 - -#### 通过 PyIceberg 写入数据 - -加载 Iceberg 表: - -```python -from pyiceberg.catalog import load_catalog - -catalog = load_catalog( - "iceberg", - **{ - "warehouse": "warehouse", - "uri": "http://rest:8181", - "s3.access-key-id": "admin", - "s3.secret-access-key": "password", - "s3.endpoint": "http://minio:9000" - }, -) -table = 
catalog.load_table("nyc.taxis") -``` - -Arrow Table 写入 Iceberg: - -```python -import pyarrow as pa - -df = pa.Table.from_pydict( - { - "vendor_id": pa.array([1, 2, 2, 1], pa.int64()), - "trip_id": pa.array([1000371, 1000372, 1000373, 1000374], pa.int64()), - "trip_distance": pa.array([1.8, 2.5, 0.9, 8.4], pa.float32()), - "fare_amount": pa.array([15.32, 22.15, 9.01, 42.13], pa.float64()), - "store_and_fwd_flag": pa.array(["N", "N", "N", "Y"], pa.string()), - "ts": pa.compute.strptime( - ["2024-01-01 9:15:23", "2024-01-02 12:10:11", "2024-01-01 3:25:15", "2024-01-03 7:12:33"], - "%Y-%m-%d %H:%M:%S", - "us", - ), - } -) -table.append(df) -``` - -Pandas DataFrame 写入 Iceberg: - -```python -import pyarrow as pa -import pandas as pd - -df = pd.DataFrame( - { - "vendor_id": pd.Series([1, 2, 2, 1]).astype("int64[pyarrow]"), - "trip_id": pd.Series([1000371, 1000372, 1000373, 1000374]).astype("int64[pyarrow]"), - "trip_distance": pd.Series([1.8, 2.5, 0.9, 8.4]).astype("float32[pyarrow]"), - "fare_amount": pd.Series([15.32, 22.15, 9.01, 42.13]).astype("float64[pyarrow]"), - "store_and_fwd_flag": pd.Series(["N", "N", "N", "Y"]).astype("string[pyarrow]"), - "ts": pd.Series(["2024-01-01 9:15:23", "2024-01-02 12:10:11", "2024-01-01 3:25:15", "2024-01-03 7:12:33"]).astype("timestamp[us][pyarrow]"), - } -) -table.append(pa.Table.from_pandas(df)) -``` - -Polars DataFrame 写入 Iceberg: - -```python -import polars as pl - -df = pl.DataFrame( - { - "vendor_id": [1, 2, 2, 1], - "trip_id": [1000371, 1000372, 1000373, 1000374], - "trip_distance": [1.8, 2.5, 0.9, 8.4], - "fare_amount": [15.32, 22.15, 9.01, 42.13], - "store_and_fwd_flag": ["N", "N", "N", "Y"], - "ts": ["2024-01-01 9:15:23", "2024-01-02 12:10:11", "2024-01-01 3:25:15", "2024-01-03 7:12:33"], - }, - { - "vendor_id": pl.Int64, - "trip_id": pl.Int64, - "trip_distance": pl.Float32, - "fare_amount": pl.Float64, - "store_and_fwd_flag": pl.String, - "ts": pl.String, - }, -).with_columns(pl.col("ts").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")) -table.append(df.to_arrow()) -``` - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md deleted file mode 100644 index 8dfba38dac237..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md +++ /dev/null @@ -1,349 +0,0 @@ ---- -{ - "title": "使用 Doris 和 LakeSoul", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 LakeSoul 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-lakesoul.png) - -## Apache Doris & LakeSoul - -LakeSoul 是由数元灵开发的云原生湖仓框架,并在 2023 年 5 月捐赠给了 Linux 基金会 AI & Data 基金会。它以元数据管理的高可扩展性、ACID 事务、高效灵活的 upsert 操作、模式演变和批流集成处理为特点。 - -借助 Apache Doris 的高性能查询引擎和 LakeSoul 的高效数据管理,用户可以实现: - -- 实时数据入湖:利用 LakeSoul 的架构,数据可以以高效率和低延迟入湖,支持包括聚合、去重和部分列更新在内的各种数据更新能力。 -- 
高性能数据处理和分析:LakeSoul 的批流集成处理和模式演变等能力可以与 Doris 的强大查询引擎无缝集成,实现湖数据的快速查询和分析响应。 -未来,Apache Doris 将逐步支持 LakeSoul 的更多高级功能,如 CDC 流同步和自动模式演变,共同构建统一的、高性能的、实时的湖仓平台。 - -本文将解释如何快速搭建 Apache Doris + LakeSoul 测试和演示环境,并演示各种功能的使用方法,展示在湖仓架构中使用两个系统集成和优势。 - -关于更多说明,请参阅 [LakeSoul Catalog](../../../lakehouse/datalake-analytics/lakesoul) - -## 使用指南 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/lakesoul](https://github.com/apache/doris/tree/master/samples/datalake/lakesoul) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 3.0.2| -| LakeSoul | 2.6.1 | -| Postgres | 14.5 | -| Apache Spark | 3.3.1 | -| Apache Flink | 1.17 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - - - -### 02 环境部署 - -1. 启动所有组件 - - - ``` - bash ./start_all.sh - ``` - -2. 启动后,可以使用以下脚本登录到 Doris 命令行: - - ``` - -- login doris - bash ./start_doris_client.sh - ``` - - -### 03 数据查询 - -如下所示,在 Doris 集群中已经创建了一个名为 lakesoul 的 Catalog(可使用 SHOW CATALOGS 查看)。以下是该 Catalog 的创建语句: - -```sql - -- Already created - CREATE CATALOG `lakesoul` PROPERTIES ( - 'type'='lakesoul', - 'lakesoul.pg.username'='lakesoul_test', - 'lakesoul.pg.password'='lakesoul_test', - 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', - 'minio.endpoint'='http://minio:9000', - 'minio.access_key'='admin', - 'minio.secret_key'='password' - ); - - ``` - LakeSoul 表 `lakesoul.tpch.customer` 已加载到 Doris 中。在 Doris 中查询数据。 - -- 查询数据 - ```sql - Doris> use `lakesoul`.`tpch`; - Database changed - - Doris> show tables; - +---------------------+ - | Tables_in_tpch | - +---------------------+ - | customer_from_spark | - +---------------------+ - 1 row in set (0.00 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | - | 14 | Customer#000000014 | KXkletMlL2JQEA | 1 | 11-845-129-3851 | 5266.30 | FURNITURE | , ironic packages across the unus | - | 30 | Customer#000000030 | nJDsELGAavU63Jl0c5NKsKfL8rIJQQkQnYL2QJY | 1 | 11-764-165-5076 | 9321.01 | BUILDING | lithely final requests. furiously unusual account | - | 59 | Customer#000000059 | zLOCP0wh92OtBihgspOGl4 | 1 | 11-355-584-3112 | 3458.60 | MACHINERY | ously final packages haggle blithely after the express deposits. 
furiou | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - 4 rows in set (3.14 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey desc limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | 14983 | Customer#000014983 | ERN3vq5Fvt4DL | 1 | 11-424-279-1846 | 841.22 | AUTOMOBILE | furiously slyly special foxes. express theodolites cajole carefully. special dinos haggle pinto | - | 14968 | Customer#000014968 | ,sykKTZBzVFl7ito1750v2TRYwmkRl2nvqGHwmx | 1 | 11-669-222-9657 | 6106.77 | HOUSEHOLD | ts above the furiously even deposits haggle across | - | 14961 | Customer#000014961 | JEIORcsBp6RpLYH 9gNdDyWJ | 1 | 11-490-251-5554 | 4006.35 | HOUSEHOLD | quests detect carefully final platelets! quickly final frays haggle slyly blithely final acc | - | 14940 | Customer#000014940 | bNoyCxPuqSwPLjbqjEUNGN d0mSP | 1 | 11-242-677-1085 | 8829.48 | HOUSEHOLD | ver the quickly express braids. regular dependencies haggle fluffily quickly i | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - 4 rows in set (0.10 sec) - ``` - -- 分区裁剪 - Doris 可以对 LakeSoul 执行分区裁剪,并通过原生读取加速查询过程。我们可以通过 `explain verbose` 来检查这一点。 - - - ```sql - Doris> explain verbose select * from customer_from_spark where c_nationkey < 3; - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | Explain String(Old Planner) | - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | PLAN FRAGMENT 0 | - | OUTPUT EXPRS: | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_custkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_name` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_address` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_nationkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_phone` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_acctbal` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_mktsegment` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_comment` | - | PARTITION: UNPARTITIONED | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | VRESULT SINK | - | MYSQL_PROTOCAL | - | | - | 1:VEXCHANGE | - | offset: 0 | - | tuple ids: 0 | - | | - | PLAN FRAGMENT 1 | - | | - | PARTITION: RANDOM | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | STREAM DATA SINK | - | EXCHANGE ID: 01 | - | UNPARTITIONED | - | | - | 0:VplanNodeName | - | table: customer_from_spark | - | predicates: (`c_nationkey` < 3) | - | inputSplitNum=12, totalFileSize=0, scanRanges=12 | - | partition=0/0 | - | 
backends: | - | 10002 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00000-0568c817-d6bc-4fa1-bb9e-b311069b131c_00000.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00001-d99a8fe6-61ab-4285-94da-2f84f8746a8a_00001.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00002-8a8e396f-685f-4b0f-87fa-e2a3fe5be87e_00002.c000.parquet start: 0 length: 0 | - | ... other 8 files ... | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=0/part-00003-d5b598cd-5bed-412c-a26f-bb4bc9c937bc_00003.c000.parquet start: 0 length: 0 | - | numNodes=1 | - | pushdown agg=NONE | - | tuple ids: 0 | - | | - | Tuples: | - | TupleDescriptor{id=0, tbl=customer_from_spark} | - | SlotDescriptor{id=0, col=c_custkey, colUniqueId=0, type=int, nullable=false, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=1, col=c_name, colUniqueId=1, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=2, col=c_address, colUniqueId=2, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=3, col=c_nationkey, colUniqueId=3, type=int, nullable=false, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=4, col=c_phone, colUniqueId=4, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=5, col=c_acctbal, colUniqueId=5, type=decimalv3(15,2), nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=6, col=c_mktsegment, colUniqueId=6, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=7, col=c_comment, colUniqueId=7, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - 57 rows in set (0.03 sec) - - ``` - - - 通过检查 `EXPLAIN VERBOSE` 语句的结果,可以看到谓词条件 `c_nationkey < 3` 最终只命中一个分区(partition=0/0)。 - -### 04 CDC 表支持 - -启动 Flink CDC 作业以同步 MySQL 表。MySQL 表在启动 `start_all.sh` 时已经被加载了。 - - -``` -bash start_flink_cdc_job.sh -``` - -```sql -Start flink-cdc job... -SLF4J: Class path contains multiple SLF4J bindings. -SLF4J: Found binding in [jar:file:/opt/flink/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class] -SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class] -SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. -SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory] -Loading class `com.mysql.jdbc.Driver'. This is deprecated. The new driver class is `com.mysql.cj.jdbc.Driver'. The driver is automatically registered via the SPI and manual loading of the driver class is generally unnecessary. 
-Job has been submitted with JobID d1b3641dcd1ad85c6b373d49b1867e68 - -``` - - -Flink CDC 作业将启动。我们可以通过重新创建 LakeSoul Catalog 在 `doris client` 中检查启动过程。Flink CDC 作业启动后,我们可以在 `doris client` 中看到正在同步的 LakeSoul CDC 表。 - -```sql -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - - -Doris> drop catalog if exists lakesoul; -Query OK, 0 rows affected (0.00 sec) - -Doris> create catalog `lakesoul` properties ('type'='lakesoul', 'lakesoul.pg.username'='lakesoul_test', 'lakesoul.pg.password'='lakesoul_test', 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', 'minio.endpoint'='http://minio:9000', 'minio.access_key'='admin', 'minio.secret_key'='password'); -Query OK, 0 rows affected (0.01 sec) - -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer | -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. 
furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (1.09 sec) - -``` - -进入 `mysql client` 并尝试修改数据。 - - -``` -bash start_mysql_client.sh -``` - -尝试从 `mysql client` 更新行。 - - -```sql -mysql> update customer set c_acctbal=2211.26 where c_custkey=1; -Query OK, 1 row affected (0.01 sec) -Rows matched: 1 Changed: 1 Warnings: 0 -``` - -回到 `doris client` 并检查数据变化。 - - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. 
carefully even theodolites haggle slyly along the ide | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (0.11 sec) - -``` - -尝试从 `mysql client` 删除行。 - - -```sql -mysql> delete from customer where c_custkey = 2; -Query OK, 1 row affected (0.01 sec) -``` - -回到 `doris client` 并检查数据变化。 - - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. 
foxes cajole accor | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -8 rows in set (0.11 sec) - -``` diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md deleted file mode 100644 index 5c7b7b98b5b6d..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md +++ /dev/null @@ -1,269 +0,0 @@ ---- -{ - "title": "使用 Doris 和 Paimon", - "language": "zh-CN" -} - ---- - - - -作为一种全新的开放式的数据管理架构,湖仓一体(Data Lakehouse)融合了数据仓库的高性能、实时性以及数据湖的低成本、灵活性等优势,帮助用户更加便捷地满足各种数据处理分析的需求,在企业的大数据体系中已经得到越来越多的应用。 - -在过去多个版本中,Apache Doris 持续加深与数据湖的融合,当前已演进出一套成熟的湖仓一体解决方案。 - -- 自 0.15 版本起,Apache Doris 引入 Hive 和 Iceberg 外部表,尝试在 Apache Iceberg 之上探索与数据湖的能力结合。 -- 自 1.2 版本起,Apache Doris 正式引入 Multi-Catalog 功能,实现了多种数据源的自动元数据映射和数据访问、并对外部数据读取和查询执行等方面做了诸多性能优化,完全具备了构建极速易用 Lakehouse 架构的能力。 -- 在 2.1 版本中,Apache Doris 湖仓一体架构得到全面加强,不仅增强了主流数据湖格式(Hudi、Iceberg、Paimon 等)的读取和写入能力,还引入了多 SQL 方言兼容、可从原有系统无缝切换至 Apache Doris。在数据科学及大规模数据读取场景上,Doris 集成了 Arrow Flight 高速读取接口,使得数据传输效率实现 100 倍的提升。 - -![使用 Doris 和 Paimon 构建 Lakehouse](/images/lakehouse-architecture-for-doris-and-paimon.png) - -## Apache Doris & Paimon - -Apache Paimon 是一种数据湖格式,它创新性地将数据湖格式和 LSM 结构的优势相结合,成功将高效的实时流更新能力引入数据湖架构中,这使得 Paimon 能够实现数据的高效管理和实时分析,为构建实时湖仓架构提供了强大的支撑。 - -为了充分发挥 Paimon 的能力,提高对 Paimon 数据的查询效率,Apache Doris 对 Paimon 的多项最新特性提供了原生支持: - -- 支持 Hive Metastore、FileSystem 等多种类型的 Paimon Catalog。 -- 原生支持 Paimon 0.6 版本发布的 Primary Key Table Read Optimized 功能。 -- 原生支持 Paimon 0.8 版本发布的 Primary Key Table Deletion Vector 功能。 - -基于 Apache Doris 的高性能查询引擎和 Apache Paimon 高效的实时流更新能力,用户可以实现: - -- 数据实时入湖:借助 Paimon 的 LSM-Tree 模型,数据入湖的时效性可以降低到分钟级;同时,Paimon 支持包括聚合、去重、部分列更新在内的多种数据更新能力,使得数据流动更加灵活高效。 -- 高性能数据处理分析:Paimon 所提供的 Append Only Table、Read Optimized、Deletion Vector 等技术,可与 Doris 强大的查询引擎对接,实现湖上数据的快速查询及分析响应。 - -未来 Apache Doris 将会逐步支持包括 Time Travel、增量数据读取在内的 Apache Paimon 更多高级特性,共同构建统一、高性能、实时的湖仓平台。 - -本文将在 Docker 环境中,为读者讲解如何快速搭建 Apache Doris + Apache Paimon 测试 & 演示环境,并展示各功能的使用操作。 - -关于更多说明,请参阅 [Paimon Catalog](../../../lakehouse/datalake-analytics/paimon.md) - -## 使用指南 - -本文涉及所有脚本和代码可以从该地址获取:[https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 环境准备 - -本文示例采用 Docker Compose 部署,组件及版本号如下: - -| 组件名称 | 版本 | -| --- | --- | -| Apache Doris | 默认 2.1.5,可修改 | -| Apache Paimon | 0.8| -| Apache Flink | 1.18| -| MinIO | RELEASE.2024-04-29T09-56-05Z| - -### 02 环境部署 - -1. 启动所有组件 - - `bash ./start_all.sh` - -2. 启动后,可以使用如下脚本,登录 Flink 命令行或 Doris 命令行: - - ```sql - -- login flink - bash ./start_flink_client.sh - - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 数据准备 - -首先,登录 Flink 命令行后,可以看到一张预构建的表。表中已经包含一些数据,我们可以通过 Flink SQL 进行查看(下方代码块中另以注释形式给出了一个自行追加测试数据的示例写法)。 - -```sql -Flink SQL> use paimon.db_paimon; -[INFO] Execute statement succeed. 
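--- 补充示例(附注,非本次会话的实际输出):customer 是带主键 (c_custkey, c_nationkey) 的 Paimon 表, --- 如需自行追加或更新测试数据,可以在 Flink SQL 中直接使用 INSERT 写入,相同主键的数据会按 upsert 方式生效。 --- 以下写法仅为示意,各字段取值均为演示用的假设数据: --- INSERT INTO customer VALUES --- (999, 'Customer#000000999', 'demo_address', 1, '11-000-000-0000', 100.00, 'BUILDING', 'demo row');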
- -Flink SQL> show tables; -+------------+ -| table name | -+------------+ -| customer | -+------------+ -1 row in set - -Flink SQL> show create table customer; -+------------------------------------------------------------------------+ -| result | -+------------------------------------------------------------------------+ -| CREATE TABLE `paimon`.`db_paimon`.`customer` ( - `c_custkey` INT NOT NULL, - `c_name` VARCHAR(25), - `c_address` VARCHAR(40), - `c_nationkey` INT NOT NULL, - `c_phone` CHAR(15), - `c_acctbal` DECIMAL(12, 2), - `c_mktsegment` CHAR(10), - `c_comment` VARCHAR(117), - CONSTRAINT `PK_c_custkey_c_nationkey` PRIMARY KEY (`c_custkey`, `c_nationkey`) NOT ENFORCED -) PARTITIONED BY (`c_nationkey`) -WITH ( - 'bucket' = '1', - 'path' = 's3://warehouse/wh/db_paimon.db/customer', - 'deletion-vectors.enabled' = 'true' -) - | -+-------------------------------------------------------------------------+ -1 row in set - -Flink SQL> desc customer; -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| name | type | null | key | extras | watermark | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| c_custkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_name | VARCHAR(25) | TRUE | | | | -| c_address | VARCHAR(40) | TRUE | | | | -| c_nationkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_phone | CHAR(15) | TRUE | | | | -| c_acctbal | DECIMAL(12, 2) | TRUE | | | | -| c_mktsegment | CHAR(10) | TRUE | | | | -| c_comment | VARCHAR(117) | TRUE | | | | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -8 rows in set - -Flink SQL> select * from customer order by c_custkey limit 4; -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platel... | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic... | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic,... | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tl... | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious ... 
| -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -4 rows in set -``` - -### 04 数据查询 - -如下所示,Doris 集群中已经创建了名为 `paimon` 的 Catalog(可通过 SHOW CATALOGS 查看)。以下为该 Catalog 的创建语句: - -```sql --- 已创建,无需执行 -CREATE CATALOG `paimon` PROPERTIES ( - "type" = "paimon", - "warehouse" = "s3://warehouse/wh/", - "s3.endpoint"="http://minio:9000", - "s3.access_key"="admin", - "s3.secret_key"="password", - "s3.region"="us-east-1" -); -``` - -你可登录到 Doris 中查询 Paimon 的数据: - -```sql -mysql> use paimon.db_paimon; -Reading table information for completion of table and column names -You can turn off this feature to get a quicker startup with -A - -Database changed -mysql> show tables; -+---------------------+ -| Tables_in_db_paimon | -+---------------------+ -| customer | -+---------------------+ -1 row in set (0.00 sec) - -mysql> select * from customer order by c_custkey limit 4; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -4 rows in set (1.89 sec) -``` - -### 05 读取增量数据 - -我们可以通过 Flink SQL 更新 Paimon 表中的数据: - -```sql -Flink SQL> update customer set c_address='c_address_update' where c_nationkey = 1; -[INFO] Submitting SQL update statement to the cluster... 
-[INFO] SQL update statement has been successfully submitted to the cluster: -Job ID: ff838b7b778a94396b332b0d93c8f7ac -``` - -等 Flink SQL 执行完毕后,在 Doris 中可直接查看到最新的数据: - -```sql -mysql> select * from customer where c_nationkey=1 limit 2; -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| 3 | Customer#000000003 | c_address_update | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 513 | Customer#000000513 | c_address_update | 1 | 11-861-303-6887 | 955.37 | HOUSEHOLD | press along the quickly regular instructions. regular requests against the carefully ironic s | -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -2 rows in set (0.19 sec) -``` - -### Benchmark - -我们在 Paimon(0.8)版本的 TPCDS 1000 数据集上进行了简单的测试,分别使用了 Apache Doris 2.1.5 版本和 Trino 422 版本,均开启 Primary Key Table Read Optimized 功能。 - -![](/images/quick-start/lakehouse-paimon-benchmark.PNG) - -从测试结果可以看到,Doris 在标准静态测试集上的平均查询性能是 Trino 的 3~5 倍。后续我们将针对 Deletion Vector 进行优化,进一步提升真实业务场景下的查询效率。 - -## 查询优化 - -对于基线数据来说,Apache Paimon 在 0.6 版本中引入 Primary Key Table Read Optimized 功能后,使得查询引擎可以直接访问底层的 Parquet/ORC 文件,大幅提升了基线数据的读取效率。对于尚未合并的增量数据(INSERT、UPDATE 或 DELETE 所产生的数据增量)来说,可以通过 Merge-on-Read 的方式进行读取。此外,Paimon 在 0.8 版本中还引入的 Deletion Vector 功能,能够进一步提升查询引擎对增量数据的读取效率。 -Apache Doris 支持通过原生的 Reader 读取 Deletion Vector 并进行 Merge on Read,我们通过 Doris 的 EXPLAIN 语句,来演示在一个查询中,基线数据和增量数据的查询方式。 - -```sql -mysql> explain verbose select * from customer where c_nationkey < 3; -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| Explain String(Nereids Planner) | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| ............... 
| -| | -| 0:VPAIMON_SCAN_NODE(68) | -| table: customer | -| predicates: (c_nationkey[#3] < 3) | -| inputSplitNum=4, totalFileSize=238324, scanRanges=4 | -| partition=3/0 | -| backends: | -| 10002 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-15cee5b7-1bd7-42ca-9314-56d92c62c03b-0.orc start: 0 length: 66600 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-5d50255a-2215-4010-b976-d5dc656f3444-0.orc start: 0 length: 44501 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=2/bucket-0/data-e98fb7ef-ec2b-4ad5-a496-713cb9481d56-0.orc start: 0 length: 64059 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=0/bucket-0/data-431be05d-50fa-401f-9680-d646757d0f95-0.orc start: 0 length: 63164 | -| cardinality=18751, numNodes=1 | -| pushdown agg=NONE | -| paimonNativeReadSplits=4/4 | -| PaimonSplitStats: | -| SplitStat [type=NATIVE, rowCount=1542, rawFileConvertable=true, hasDeletionVector=true] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| tuple ids: 0 -| ............... | | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -67 rows in set (0.23 sec) -``` - -可以看到,对于刚才通过 Flink SQL 更新的表,包含 4 个分片,并且全部分片都可以通过 Native Reader 进行访问(`paimonNativeReadSplits=4/4`)。并且第一个分片的`hasDeletionVector`的属性为`true`,表示该分片有对应的 Deletion Vector,读取时会根据 Deletion Vector 进行数据过滤。 \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/what-is-apache-doris.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/what-is-apache-doris.md index b47121e05b22b..6a8d89c14e980 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/what-is-apache-doris.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/what-is-apache-doris.md @@ -104,6 +104,6 @@ Apache Doris 也支持**强一致的物化视图**,物化视图的更新和选 ![Doris 查询引擎是向量化](/images/getting-started/apache-doris-query-engine-2.png) -**Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术,**可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 +**Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术,** 可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 在**优化器**方面,Apache Doris 使用 CBO 和 RBO 结合的优化策略,RBO 支持常量折叠、子查询改写、谓词下推等,CBO 支持 Join Reorder。目前 CBO 还在持续优化中,主要集中在更加精准的统计信息收集和推导,更加精准的代价模型预估等方面。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/gettingStarted/tutorials/log-storage-analysis.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/log-storage-analysis.md similarity index 99% rename from i18n/zh-CN/docusaurus-plugin-content-docs/current/gettingStarted/tutorials/log-storage-analysis.md rename to i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/log-storage-analysis.md index 9669a0ea06df1..e5c7adcac3165 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/gettingStarted/tutorials/log-storage-analysis.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/log-storage-analysis.md @@ -1,6 +1,6 @@ --- { - "title": "构建日志存储与分析平台", + "title": "日志存储与分析", 
"language": "zh-CN" } --- @@ -218,13 +218,13 @@ Apache Doris 对 Flexible Schema 的日志数据提供了几个方面的支持 更多关于分区分桶的信息,可参考 [数据划分](../../table-design/data-partitioning/basic-concepts)。 **配置压缩参数** -- 使用 zstd 压缩算法(`"compression" = "zstd"`), 提高数据压缩率。 +- 使用 zstd 压缩算法 (`"compression" = "zstd"`), 提高数据压缩率。 **配置 Compaction 参数** 按照以下说明配置 Compaction 参数: -- 使用 time_series 策略(`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 +- 使用 time_series 策略 (`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 **建立和配置索引参数** diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1.json b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1.json index d4539078634d6..5764c877c2349 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1.json +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1.json @@ -59,9 +59,9 @@ "message": "部署存算分离集群", "description": "The label for category Deploying on Kubernetes in sidebar docs" }, - "sidebar.docs.category.Deployment on Cloud": { + "sidebar.docs.category.Deploying on Cloud": { "message": "云上部署集群", - "description": "The label for category Deployment on Cloud in sidebar docs" + "description": "The label for category Deploying on Cloud in sidebar docs" }, "sidebar.docs.category.Database Connection": { "message": "数据库连接", @@ -668,7 +668,7 @@ "description": "The label for category Cross Cluster Replication in sidebar docs" }, "sidebar.docs.category.Tiered Storage": { - "message": "分层存储", + "message": "冷热数据分层", "description": "The label for category Tiered Storage in sidebar docs" }, "sidebar.docs.category.Business Continuity & Data Recovery": { diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/delete/delete-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/delete/delete-overview.md index 74b6a6049a5b6..395783dc90d7a 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/delete/delete-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/delete/delete-overview.md @@ -28,7 +28,7 @@ under the License. ## 删除的实现机制 -Doris 的删除操作采用**标记删除(Logical Deletion)**的方式,而不是直接物理删除数据。以下是其核心实现机制: +Doris 的删除操作采用**标记删除(Logical Deletion)** 的方式,而不是直接物理删除数据。以下是其核心实现机制: 1. **标记删除**。删除操作不会直接从存储中移除数据,而是为目标数据添加一条删除标记。标记删除主要有两种实现方式:delete 谓词和 delete sign。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/update/unique-update-concurrent-control.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/update/unique-update-concurrent-control.md index 7fb631fc893d1..f46d3170707e8 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/update/unique-update-concurrent-control.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/data-operate/update/unique-update-concurrent-control.md @@ -28,7 +28,7 @@ under the License. 
Doris 采用多版本并发控制机制(MVCC - Multi-Version Concurrency Control)来管理并发更新。每次数据写入操作均会分配一个写入事务,该事务确保数据写入的原子性(即写入操作要么完全成功,要么完全失败)。在写入事务提交时,系统会为其分配一个版本号。当用户使用 Unique Key 模型并多次导入数据时,如果存在重复主键,Doris 会根据版本号确定覆盖顺序:版本号较高的数据会覆盖版本号较低的数据。 -在某些场景中,用户可能需要通过在建表语句中指定 sequence 列来灵活调整数据的生效顺序。例如,当通过多线程并发同步数据到 Doris 时,不同线程的数据可能会乱序到达。这种情况下,可能出现旧数据因较晚到达而错误覆盖新数据的情况。为解决这一问题,用户可以为旧数据指定较低的 sequence 值,为新数据指定较高的 sequence 值,从而让 Doris 根据用户提供的 sequence值来正确确定数据的更新顺序。 +在某些场景中,用户可能需要通过在建表语句中指定 sequence 列来灵活调整数据的生效顺序。例如,当通过多线程并发同步数据到 Doris 时,不同线程的数据可能会乱序到达。这种情况下,可能出现旧数据因较晚到达而错误覆盖新数据的情况。为解决这一问题,用户可以为旧数据指定较低的 sequence 值,为新数据指定较高的 sequence 值,从而让 Doris 根据用户提供的 sequence 值来正确确定数据的更新顺序。 此外,`UPDATE` 语句与通过导入实现更新在底层机制上存在较大差异。`UPDATE` 操作涉及两个步骤:从数据库中读取待更新的数据,以及写入更新后的数据。默认情况下,`UPDATE` 语句通过表级锁提供了 Serializable 隔离级别的事务能力,即多个 `UPDATE` 操作只能串行执行。用户也可以通过调整配置绕过这一限制,具体方法请参阅以下章节的详细说明。 @@ -72,7 +72,7 @@ sequence 列目前只支持 Unique 模型。 **Sequence 列建表时有两种方式,一种是建表时设置`sequence_col`属性,一种是建表时设置`sequence_type`属性。** -**1. 设置****`sequence_col`(推荐)** +**1. 设置 `sequence_col`(推荐)** 创建 Unique 表时,指定 sequence 列到表中其他 column 的映射 @@ -86,7 +86,7 @@ sequence_col 用来指定 sequence 列到表中某一列的映射,该列可以 导入方式和没有 sequence 列时一样,使用相对比较简单,推荐使用。 -**2. 设置****`sequence_type`** +**2. 设置 `sequence_type`** 创建 Unique 表时,指定 sequence 列类型 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/db-connect/database-connect.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/db-connect/database-connect.md index d781e92299414..47a2ed6066717 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/db-connect/database-connect.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/db-connect/database-connect.md @@ -1,6 +1,6 @@ --- { - "title": "数据库连接", + "title": "通过 MySQL 协议连接", "language": "zh-CN" } --- diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/gettingStarted/what-is-apache-doris.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/gettingStarted/what-is-apache-doris.md index f9bf076980770..13da15b9e52bd 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/gettingStarted/what-is-apache-doris.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/gettingStarted/what-is-apache-doris.md @@ -128,7 +128,7 @@ Apache Doris 查询引擎是向量化的查询引擎,所有的内存结构能 ![Doris 查询引擎是向量化](/images/getting-started/apache-doris-query-engine-2.png) -Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术, 可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 +Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术,可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 在优化器方面,Apache Doris 使用 CBO 和 RBO 结合的优化策略,RBO 支持常量折叠、子查询改写、谓词下推等,CBO 支持 Join Reorder。目前 CBO 还在持续优化中,主要集中在更加精准的统计信息收集和推导,更加精准的代价模型预估等方面。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md index 3976195fae91a..4dbf8700de6ab 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md @@ -1,6 +1,6 @@ --- { 
- "title": "手动部署存算一体集群", + "title": "手动部署集群", "language": "zh-CN" } --- diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/log-storage-analysis.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/log-storage-analysis.md similarity index 99% rename from i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/log-storage-analysis.md rename to i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/log-storage-analysis.md index 9669a0ea06df1..e5c7adcac3165 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-1.2/gettingStarted/tutorials/log-storage-analysis.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/log-storage-analysis.md @@ -1,6 +1,6 @@ --- { - "title": "构建日志存储与分析平台", + "title": "日志存储与分析", "language": "zh-CN" } --- @@ -218,13 +218,13 @@ Apache Doris 对 Flexible Schema 的日志数据提供了几个方面的支持 更多关于分区分桶的信息,可参考 [数据划分](../../table-design/data-partitioning/basic-concepts)。 **配置压缩参数** -- 使用 zstd 压缩算法(`"compression" = "zstd"`), 提高数据压缩率。 +- 使用 zstd 压缩算法 (`"compression" = "zstd"`), 提高数据压缩率。 **配置 Compaction 参数** 按照以下说明配置 Compaction 参数: -- 使用 time_series 策略(`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 +- 使用 time_series 策略 (`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 **建立和配置索引参数** diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md index d3f848618fd9a..186acb51820bd 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md @@ -1,6 +1,6 @@ --- { - "title": "并行执行", + "title": "Pipeline 执行引擎", "language": "zh-CN", "toc_min_heading_level": 2, "toc_max_heading_level": 4 @@ -28,71 +28,71 @@ under the License. 
-Doris的并行执行模型是一种Pipeline 执行模型,主要参考了[Hyper](https://db.in.tum.de/~leis/papers/morsels.pdf)论文中Pipeline的实现方式,Pipeline 执行模型能够充分释放多核 CPU 的计算能力,并对 Doris 的查询线程的数目进行限制,解决 Doris 的执行线程膨胀的问题。它的具体设计、实现和效果可以参阅 [DSIP-027](DSIP-027: Support Pipeline Exec Engine - DORIS - Apache Software Foundation) 以及 [DSIP-035](DSIP-035: PipelineX Execution Engine - DORIS - Apache Software Foundation)。 -Doris 3.0 之后,Pipeline 执行模型彻底替换了原有的火山模型,基于Pipeline 执行模型,Doris 实现了 Query、DDL、DML 语句的并行处理。 +Doris 的并行执行模型是一种 Pipeline 执行模型,主要参考了[Hyper](https://db.in.tum.de/~leis/papers/morsels.pdf)论文中 Pipeline 的实现方式,Pipeline 执行模型能够充分释放多核 CPU 的计算能力,并对 Doris 的查询线程的数目进行限制,解决 Doris 的执行线程膨胀的问题。它的具体设计、实现和效果可以参阅 [DSIP-027](DSIP-027: Support Pipeline Exec Engine - DORIS - Apache Software Foundation) 以及 [DSIP-035](DSIP-035: PipelineX Execution Engine - DORIS - Apache Software Foundation)。 +Doris 3.0 之后,Pipeline 执行模型彻底替换了原有的火山模型,基于 Pipeline 执行模型,Doris 实现了 Query、DDL、DML 语句的并行处理。 ## 物理计划 -为了更好的理解Pipeline 执行模型,首先需要介绍一下物理查询计划中两个重要的概念:PlanFragment和PlanNode。我们使用下面这条SQL 作为例子: +为了更好的理解 Pipeline 执行模型,首先需要介绍一下物理查询计划中两个重要的概念:PlanFragment 和 PlanNode。我们使用下面这条 SQL 作为例子: ``` SELECT k1, SUM(v1) FROM A,B WHERE A.k2 = B.k2 GROUP BY k1 ORDER BY SUM(v1); ``` -FE 首先会把它翻译成下面这种逻辑计划,计划中每个节点就是一个PlanNode,每种Node的具体含义,可以参考查看物理计划的介绍。 +FE 首先会把它翻译成下面这种逻辑计划,计划中每个节点就是一个 PlanNode,每种 Node 的具体含义,可以参考查看物理计划的介绍。 ![pip_exec_1](/images/pip_exec_1.png) -由于Doris 是一个MPP的架构,每个查询都会尽可能的让所有的BE 都参与进来并行执行,来降低查询的延时。所以还需要将上述逻辑计划拆分为一个物理计划,拆分物理计划基本上就是在逻辑计划中插入了DataSink和ExchangeNode,通过这两个Node完成了数据在多个BE 之间的Shuffle。拆分完成后,每个PlanFragment 相当于包含了一部分PlanNode,可以作为一个独立的任务发送给BE,每个BE 完成了PlanFragment内包含的PlanNode的计算后,通过DataSink和ExchangeNode 这两个算子把数据shuffle到其他BE上来进行接下来的计算。 +由于 Doris 是一个 MPP 的架构,每个查询都会尽可能的让所有的 BE 都参与进来并行执行,来降低查询的延时。所以还需要将上述逻辑计划拆分为一个物理计划,拆分物理计划基本上就是在逻辑计划中插入了 DataSink 和 ExchangeNode,通过这两个 Node 完成了数据在多个 BE 之间的 Shuffle。拆分完成后,每个 PlanFragment 相当于包含了一部分 PlanNode,可以作为一个独立的任务发送给 BE,每个 BE 完成了 PlanFragment 内包含的 PlanNode 的计算后,通过 DataSink 和 ExchangeNode 这两个算子把数据 shuffle 到其他 BE 上来进行接下来的计算。 ![pip_exec_2](/images/pip_exec_2.png) -所以Doris的规划分为3层: -PLAN:执行计划,一个SQL会被执行规划器翻译成一个执行计划,之后执行计划会提供给执行引擎执行。 +所以 Doris 的规划分为 3 层: +PLAN:执行计划,一个 SQL 会被执行规划器翻译成一个执行计划,之后执行计划会提供给执行引擎执行。 -FRAGMENT:由于DORIS是一个分布式执行引擎。一个完整的执行计划会被切分为多个单机的执行片段。一个FRAGMENT表是一个完整的单机执行片段。多个FRAGMENT组合在一起,构成一个完整的PLAN。 +FRAGMENT:由于 DORIS 是一个分布式执行引擎。一个完整的执行计划会被切分为多个单机的执行片段。一个 FRAGMENT 表是一个完整的单机执行片段。多个 FRAGMENT 组合在一起,构成一个完整的 PLAN。 -PLAN NODE:算子,是执行计划的最小单位。一个FRAGMENT由多个算子构成。每一个算子负责一个实际的执行逻辑,比如聚合,连接等 +PLAN NODE:算子,是执行计划的最小单位。一个 FRAGMENT 由多个算子构成。每一个算子负责一个实际的执行逻辑,比如聚合,连接等 ## Pipeline 执行 -PlanFragment 是FE 发往BE 执行任务的最小单位。BE可能会收到同一个Query的多个不同的PlanFragment,每个PlanFragment都会被单独的处理。在收到PlanFragment 之后,BE会把PlanFragment 拆分为多个Pipeline,进而启动多个PipelineTask 来实现并行执行,提升查询效率。 +PlanFragment 是 FE 发往 BE 执行任务的最小单位。BE 可能会收到同一个 Query 的多个不同的 PlanFragment,每个 PlanFragment 都会被单独的处理。在收到 PlanFragment 之后,BE 会把 PlanFragment 拆分为多个 Pipeline,进而启动多个 PipelineTask 来实现并行执行,提升查询效率。 ![pip_exec_3](/images/pip_exec_3.png) ### Pipeline -一个Pipeline 有一个SourceOperator 和 一个SinkOperator 以及中间的多个其他Operator组成。SourceOperator 代表从外部读取数据,可以是一个表(OlapTable),也可以是一个Buffer(Exchange)。SinkOperator 表示数据的输出,输出可以是通过网络shuffle到别的节点,比如DataStreamSinkOperator,也可以是输出到HashTable,比如Agg算子,JoinBuildHashTable等。 +一个 Pipeline 有一个 SourceOperator 和 一个 SinkOperator 以及中间的多个其他 Operator 组成。SourceOperator 代表从外部读取数据,可以是一个表(OlapTable),也可以是一个 Buffer(Exchange)。SinkOperator 表示数据的输出,输出可以是通过网络 shuffle 到别的节点,比如 DataStreamSinkOperator,也可以是输出到 HashTable,比如 Agg 算子,JoinBuildHashTable 等。 ![pip_exec_4](/images/pip_exec_4.png) -多个Pipeline 
之间实际是有依赖关系的,以JoinNode为例,他实际被拆分到了2个Pipeline 里。其中Pipeline-0是读取Exchange的数据,来构建HashTable;Pipeline-1 是从表里读取数据,来进行Probe。这2个Pipeline 之间是有关联关系的,只有Pipeline-0运行完毕之后才能执行Pipeline-1。这两者之间的依赖关系,称为Dependency。当Pipeline-0 运行完毕后,会调用Dependency的set_ready 方法通知Pipeline-1 可执行。 +多个 Pipeline 之间实际是有依赖关系的,以 JoinNode 为例,他实际被拆分到了 2 个 Pipeline 里。其中 Pipeline-0 是读取 Exchange 的数据,来构建 HashTable;Pipeline-1 是从表里读取数据,来进行 Probe。这 2 个 Pipeline 之间是有关联关系的,只有 Pipeline-0 运行完毕之后才能执行 Pipeline-1。这两者之间的依赖关系,称为 Dependency。当 Pipeline-0 运行完毕后,会调用 Dependency 的 set_ready 方法通知 Pipeline-1 可执行。 ### PipelineTask -Pipeline 实际还是一个逻辑概念,他并不是一个可执行的实体。在有了Pipeline之后,需要进一步的把Pipeline 实例化为多个PipelineTask。将需要读取的数据分配给不同的PipelineTask 最终实现并行处理。同一个Pipeline的多个PipelineTask 之间的Operator 完全相同,他们的区别在于Operator的状态不一样,比如读取的数据不一样,构建出的HashTable 不一样,这些不一样的状态,我们称之为LocalState。 -每个PipelineTask 最终都会被提交到一个线程池中作为独立的任务执行。在Dependency 这种触发机制下,可以更好的利用多核CPU,实现充分的并行。 +Pipeline 实际还是一个逻辑概念,他并不是一个可执行的实体。在有了 Pipeline 之后,需要进一步的把 Pipeline 实例化为多个 PipelineTask。将需要读取的数据分配给不同的 PipelineTask 最终实现并行处理。同一个 Pipeline 的多个 PipelineTask 之间的 Operator 完全相同,他们的区别在于 Operator 的状态不一样,比如读取的数据不一样,构建出的 HashTable 不一样,这些不一样的状态,我们称之为 LocalState。 +每个 PipelineTask 最终都会被提交到一个线程池中作为独立的任务执行。在 Dependency 这种触发机制下,可以更好的利用多核 CPU,实现充分的并行。 ### Operator -在大多数时候,Pipeline 中的每个Operator 都对应了一个PlanNode,但是有一些特殊的算子除外: -- JoinNode,被拆分为JoinBuildOperator和JoinProbeOperator -- AggNode 被拆分为AggSinkOperator和AggSourceOperator -- SortNode 被拆分为SortSinkOperator 和 SortSourceOperator -基本原理是,对于一些breaking 算子(需要把所有的数据都收集齐之后才能运算的算子),把灌入数据的部分拆分为Sink,然后把从这个算子里获取数据的部分称为Source。 +在大多数时候,Pipeline 中的每个 Operator 都对应了一个 PlanNode,但是有一些特殊的算子除外: +- JoinNode,被拆分为 JoinBuildOperator 和 JoinProbeOperator +- AggNode 被拆分为 AggSinkOperator 和 AggSourceOperator +- SortNode 被拆分为 SortSinkOperator 和 SortSourceOperator +基本原理是,对于一些 breaking 算子(需要把所有的数据都收集齐之后才能运算的算子),把灌入数据的部分拆分为 Sink,然后把从这个算子里获取数据的部分称为 Source。 ## Scan 并行化 -扫描数据是一个非常重的IO 操作,它需要从本地磁盘读取大量的数据(如果是数据湖的场景,就需要从HDFS或者S3中读取,延时更长),需要比较多的时间。所以我们在ScanOperator 中引入了并行扫描的技术,ScanOperator会动态的生成多个Scanner,每个Scanner 扫描100w-200w 行左右的数据,每个Scanner 在做数据扫描时,完成相应的数据解压、过滤等计算任务,然后把数据发送给一个DataQueue,供ScanOperator 读取。 +扫描数据是一个非常重的 IO 操作,它需要从本地磁盘读取大量的数据(如果是数据湖的场景,就需要从 HDFS 或者 S3 中读取,延时更长),需要比较多的时间。所以我们在 ScanOperator 中引入了并行扫描的技术,ScanOperator 会动态的生成多个 Scanner,每个 Scanner 扫描 100w-200w 行左右的数据,每个 Scanner 在做数据扫描时,完成相应的数据解压、过滤等计算任务,然后把数据发送给一个 DataQueue,供 ScanOperator 读取。 ![pip_exec_5](/images/pip_exec_5.png) -通过并行扫描的技术可以有效的避免由于分桶不合理或者数据倾斜导致某些ScanOperator 执行时间特别久,把整个查询的延时都拖慢的问题。 +通过并行扫描的技术可以有效的避免由于分桶不合理或者数据倾斜导致某些 ScanOperator 执行时间特别久,把整个查询的延时都拖慢的问题。 ## Local Shuffle -在Pipeline执行模型中,Local Exchange作为一个Pipeline Breaker出现,是在本地将数据重新分发至各个执行任务的技术。它把上游Pipeline输出的全部数据以某种方式(HASH / Round Robin)均匀分发到下游Pipeline的全部Task中。解决执行过程中的数据倾斜的问题,使执行模型不再受数据存储以及plan的限制。接下来我们举例来说明Local Exchange的工作逻辑。 -我们用上述例子中的Pipeline-1为例子进一步阐述Local Exchange如何可以避免数据倾斜。 +在 Pipeline 执行模型中,Local Exchange 作为一个 Pipeline Breaker 出现,是在本地将数据重新分发至各个执行任务的技术。它把上游 Pipeline 输出的全部数据以某种方式(HASH / Round Robin)均匀分发到下游 Pipeline 的全部 Task 中。解决执行过程中的数据倾斜的问题,使执行模型不再受数据存储以及 plan 的限制。接下来我们举例来说明 Local Exchange 的工作逻辑。 +我们用上述例子中的 Pipeline-1 为例子进一步阐述 Local Exchange 如何可以避免数据倾斜。 ![pip_exec_6](/images/pip_exec_6.png) -如上图所示,首先,通过在Pipeline 1中插入Local Exchange,我们把Pipeline 1进一步拆分成Pipeline 1-0和Pipeline 1-1。 -此时,我们不妨假设当前并发等于3(每个Pipeline有3个task),每个task读取存储层的一个bucket,而3个bucket中数据行数分别是1,1,7。则插入Local Exchange前后的执行变化如下: +如上图所示,首先,通过在 Pipeline 1 中插入 Local Exchange,我们把 Pipeline 1 进一步拆分成 Pipeline 1-0 和 Pipeline 1-1。 +此时,我们不妨假设当前并发等于 3(每个 Pipeline 有 3 个 task),每个 task 读取存储层的一个 bucket,而 3 个 bucket 中数据行数分别是 1,1,7。则插入 Local Exchange 前后的执行变化如下: 
![pip_exec_7](/images/pip_exec_7.png) -从图右可以看出,HashJoin和Agg算子需要处理的数据量从(1,1,7)变成了(3,3,3)从而避免了数据倾斜。 -在Doris中,Local Exchange根据一系列规则来决定是否被规划,例如当查询耗时比较大的Join、聚合、窗口函数等算子需要被执行时,我们就需要使用Local Exchange来尽可能避免数据倾斜。 \ No newline at end of file +从图右可以看出,HashJoin 和 Agg 算子需要处理的数据量从 (1,1,7) 变成了 (3,3,3) 从而避免了数据倾斜。 +在 Doris 中,Local Exchange 根据一系列规则来决定是否被规划,例如当查询耗时比较大的 Join、聚合、窗口函数等算子需要被执行时,我们就需要使用 Local Exchange 来尽可能避免数据倾斜。 \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/column-compression.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/column-compression.md index a6a4194b85a4d..7e56a5b6ac443 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/column-compression.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/column-compression.md @@ -1,6 +1,6 @@ --- { - "title": "按列压缩", + "title": "数据压缩", "language": "zh_CN" } --- @@ -44,10 +44,10 @@ Doris 支持多种压缩算法,每种算法在压缩率和解压速度之间 |-------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------| | **无压缩** | - 数据不进行压缩。 | 适用于不需要压缩的场景,例如数据已经被压缩或者存储空间不是问题的情况。 | | **LZ4** | - 压缩和解压速度非常快。
- 压缩比适中。 | 适用于对解压速度要求高的场景,如实时查询或高并发负载。 | -| **LZ4F (LZ4框架)** | - LZ4的扩展版本,支持更灵活的压缩配置。
- 速度快,压缩比适中。 | 适用于需要快速压缩并对配置有细粒度控制的场景。 | -| **LZ4HC (LZ4高压缩)** | - 相比LZ4有更高的压缩比,但压缩速度较慢。
- 解压速度与LZ4相当。 | 适用于需要更高压缩比的场景,同时仍然关注解压速度。 | +| **LZ4F (LZ4 框架)** | - LZ4 的扩展版本,支持更灵活的压缩配置。
- 速度快,压缩比适中。 | 适用于需要快速压缩并对配置有细粒度控制的场景。 | +| **LZ4HC (LZ4 高压缩)** | - 相比 LZ4 有更高的压缩比,但压缩速度较慢。
- 解压速度与 LZ4 相当。 | 适用于需要更高压缩比的场景,同时仍然关注解压速度。 | | **ZSTD (Zstandard)** | - 高压缩比,支持灵活的压缩级别调整。
- 即使在高压缩比下,解压速度仍然很快。 | 适用于对存储效率要求较高且需要平衡查询性能的场景。 | -| **Snappy** | - 设计重点是快速解压。
- 压缩比适中。 | 适用于对解压速度要求高且对CPU消耗低的场景。 | +| **Snappy** | - 设计重点是快速解压。
- 压缩比适中。 | 适用于对解压速度要求高且对 CPU 消耗低的场景。 | | **Zlib** | - 提供良好的压缩比与速度平衡。
- 与其他算法相比,压缩和解压速度较慢,但压缩比更高。 | 适用于对存储效率要求较高且对解压速度不敏感的场景,如归档和冷数据存储。 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/data-model/aggregate.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/data-model/aggregate.md index 8ea30edc81f66..79036e7c5f8f5 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/data-model/aggregate.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/data-model/aggregate.md @@ -88,7 +88,7 @@ DISTRIBUTED BY HASH(user_id) BUCKETS 10; * BITMAP_UNION:BIMTAP 类型的列的聚合方式,进行位图的并集聚合。 -:::info 提示: +:::info 提示: 如果以上的聚合方式无法满足业务需求,可以选择使用 agg_state 类型。 ::: @@ -129,7 +129,7 @@ SELECT * FROM example_tbl_agg; ## AGG_STATE -::: info 提示: +:::info 提示: AGG_STATE 是实验特性,建议在开发与测试环境中使用。 ::: diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/tiered-storage/overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/tiered-storage/overview.md index a0df890036e5a..2df6839366077 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/tiered-storage/overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.1/table-design/tiered-storage/overview.md @@ -1,6 +1,6 @@ --- { - "title": "分层存储", + "title": "冷热数据分层概述", "language": "zh-CN" } --- @@ -30,6 +30,6 @@ under the License. |--------------------|------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------| | **存算分离** | 用户具备部署存算分离的条件 | - 数据以单副本完全存储在对象存储中
- 通过本地缓存加速热数据访问
- 存储与计算资源独立扩展,显著降低存储成本 | | **本地分层** | 存算一体模式下,用户希望进一步优化本地存储资源 | - 支持将冷数据从 SSD 冷却到 HDD
- 充分利用本地存储层级特性,节省高性能存储成本 | -| **远程分层** | 存算一体模式下,使用廉价的对象存储或者 HDFS 进一步降低成本 | - 冷数据以单副本形式保存到对象存储或者 HDFS中
- 热数据继续使用本地存储
- 不能对一个表和本地分层混合使用 | +| **远程分层** | 存算一体模式下,使用廉价的对象存储或者 HDFS 进一步降低成本 | - 冷数据以单副本形式保存到对象存储或者 HDFS 中
- 热数据继续使用本地存储
- 不能对一个表和本地分层混合使用 | 通过上述模式,Doris 能够灵活适配用户的部署条件,实现查询效率与存储成本的平衡。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0.json b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0.json index 0acf99b0fd10a..c0f94005c79ad 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0.json +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0.json @@ -59,9 +59,9 @@ "message": "部署存算分离集群", "description": "The label for category Deploying on Kubernetes in sidebar docs" }, - "sidebar.docs.category.Deployment on Cloud": { + "sidebar.docs.category.Deploying on Cloud": { "message": "云上部署集群", - "description": "The label for category Deployment on Cloud in sidebar docs" + "description": "The label for category Deploying on Cloud in sidebar docs" }, "sidebar.docs.category.Database Connection": { "message": "数据库连接", @@ -672,7 +672,7 @@ "description": "The label for category Cross Cluster Replication in sidebar docs" }, "sidebar.docs.category.Tiered Storage": { - "message": "分层存储", + "message": "冷热数据分层", "description": "The label for category Tiered Storage in sidebar docs" }, "sidebar.docs.category.Business Continuity & Data Recovery": { diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/delete/delete-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/delete/delete-overview.md index 74b6a6049a5b6..395783dc90d7a 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/delete/delete-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/delete/delete-overview.md @@ -28,7 +28,7 @@ under the License. ## 删除的实现机制 -Doris 的删除操作采用**标记删除(Logical Deletion)**的方式,而不是直接物理删除数据。以下是其核心实现机制: +Doris 的删除操作采用**标记删除(Logical Deletion)** 的方式,而不是直接物理删除数据。以下是其核心实现机制: 1. **标记删除**。删除操作不会直接从存储中移除数据,而是为目标数据添加一条删除标记。标记删除主要有两种实现方式:delete 谓词和 delete sign。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/update/unique-update-concurrent-control.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/update/unique-update-concurrent-control.md index 7fb631fc893d1..f46d3170707e8 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/update/unique-update-concurrent-control.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/data-operate/update/unique-update-concurrent-control.md @@ -28,7 +28,7 @@ under the License. Doris 采用多版本并发控制机制(MVCC - Multi-Version Concurrency Control)来管理并发更新。每次数据写入操作均会分配一个写入事务,该事务确保数据写入的原子性(即写入操作要么完全成功,要么完全失败)。在写入事务提交时,系统会为其分配一个版本号。当用户使用 Unique Key 模型并多次导入数据时,如果存在重复主键,Doris 会根据版本号确定覆盖顺序:版本号较高的数据会覆盖版本号较低的数据。 -在某些场景中,用户可能需要通过在建表语句中指定 sequence 列来灵活调整数据的生效顺序。例如,当通过多线程并发同步数据到 Doris 时,不同线程的数据可能会乱序到达。这种情况下,可能出现旧数据因较晚到达而错误覆盖新数据的情况。为解决这一问题,用户可以为旧数据指定较低的 sequence 值,为新数据指定较高的 sequence 值,从而让 Doris 根据用户提供的 sequence值来正确确定数据的更新顺序。 +在某些场景中,用户可能需要通过在建表语句中指定 sequence 列来灵活调整数据的生效顺序。例如,当通过多线程并发同步数据到 Doris 时,不同线程的数据可能会乱序到达。这种情况下,可能出现旧数据因较晚到达而错误覆盖新数据的情况。为解决这一问题,用户可以为旧数据指定较低的 sequence 值,为新数据指定较高的 sequence 值,从而让 Doris 根据用户提供的 sequence 值来正确确定数据的更新顺序。 此外,`UPDATE` 语句与通过导入实现更新在底层机制上存在较大差异。`UPDATE` 操作涉及两个步骤:从数据库中读取待更新的数据,以及写入更新后的数据。默认情况下,`UPDATE` 语句通过表级锁提供了 Serializable 隔离级别的事务能力,即多个 `UPDATE` 操作只能串行执行。用户也可以通过调整配置绕过这一限制,具体方法请参阅以下章节的详细说明。 @@ -72,7 +72,7 @@ sequence 列目前只支持 Unique 模型。 **Sequence 列建表时有两种方式,一种是建表时设置`sequence_col`属性,一种是建表时设置`sequence_type`属性。** -**1. 设置****`sequence_col`(推荐)** +**1. 
设置 `sequence_col`(推荐)** 创建 Unique 表时,指定 sequence 列到表中其他 column 的映射 @@ -86,7 +86,7 @@ sequence_col 用来指定 sequence 列到表中某一列的映射,该列可以 导入方式和没有 sequence 列时一样,使用相对比较简单,推荐使用。 -**2. 设置****`sequence_type`** +**2. 设置 `sequence_type`** 创建 Unique 表时,指定 sequence 列类型 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/db-connect/database-connect.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/db-connect/database-connect.md index d781e92299414..47a2ed6066717 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/db-connect/database-connect.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/db-connect/database-connect.md @@ -1,6 +1,6 @@ --- { - "title": "数据库连接", + "title": "通过 MySQL 协议连接", "language": "zh-CN" } --- diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md deleted file mode 100644 index 9669a0ea06df1..0000000000000 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md +++ /dev/null @@ -1,565 +0,0 @@ ---- -{ - "title": "构建日志存储与分析平台", - "language": "zh-CN" -} ---- - - - -日志是系统运行的详细记录,包含各种事件发生的主体、时间、位置、内容等关键信息。出于运维可观测、网络安全监控及业务分析等多重需求,企业通常需要将分散的日志采集起来,进行集中存储、查询和分析,以进一步从日志数据里挖掘出有价值的内容。 - -针对此场景,Apache Doris 提供了相应解决方案,针对日志场景的特点,增加了倒排索引和极速全文检索能力,极致优化写入性能和存储空间,使得用户可以基于 Apache Doris 构建开放、高性能、低成本、统一的日志存储与分析平台。 - -本文将围绕这一解决方案,介绍以下内容: - -- **整体架构**:说明基于 Apache Doris 构建的日志存储与分析平台的核心组成部分和基础架构。 -- **特点与优势**:说明基于 Apache Doris 构建的日志存储与分析平台的特点和优势。 -- **操作指南**:说明如何基于 Apache Doris 构建日志存储分析平台。 - -## 整体架构 - -基于 Apache Doris 构建的日志存储与分析平台的架构如下图: - -![Overall architecture](/images/doris-overall-architecture.png) - -此架构主要由 3 大部分组成: - -- **日志采集和预处理**:多种日志采集工具可以通过 HTTP APIs 将日志数据写入 Apache Doris。 -- **日志存储和分析引擎**:Apache Doris 提供高性能、低成本的统一日志存储,通过 SQL 接口提供丰富的检索分析能力。 -- **日志分析和告警界面**:多种日志检索分析通工具通过标准 SQL 接口查询 Apache Doris,为用户提供简单易用的界面。 - -## 特点与优势 - -基于 Apache Doris 构建的日志存储与分析平台的特点和优势如下: - -- **高吞吐、低延迟日志写入**:支持每天百 TB 级、GB/s 级日志数据持续稳定写入,同时保持延迟 1s 以内。 -- **海量日志数据低成本存储**:支持 PB 级海量存储,相对于 Elasticsearch 存储成本节省 60% 到 80%,支持冷数据存储到 S3/HDFS,存储成本再降 50%。 -- **高性能日志全文检索分析**:支持倒排索引和全文检索,日志场景常见查询(关键词检索明细、趋势分析等)秒级响应。 -- **开放、易用的上下游生态**:上游通过 Stream Load 通用 HTTP APIs 对接常见的日志采集系统和数据源 Logstash、Filebeat、Fluentbit、Kafka 等,下游通过标准 MySQL 协议和语法对接各种可视化分析 UI,比如可观测性 Grafana、BI 分析 Superset、类 Kibana 的日志检索 Doris WebUI。 - -### 高性能、低成本 - -经过 Benchmark 测试及生产验证,基于 Apache Doris 构建的日志存储与分析平台,性价比相对于 Elasticsearch 具有 5~10 倍的提升。Apache Doris 的性能优势,主要得益于全球领先的高性能存储和查询引擎,以及下面一些针对日志场景的专门优化: - -- **写入吞吐提升**:Elasticsearch 写入的性能瓶颈在于解析数据和构建倒排索引的 CPU 消耗。相比之下,Apache Doris 进行了两方面的写入优化:一方面利用 SIMD 等 CPU 向量化指令提升了 JSON 数据解析速度和索引构建性能;另一方面针对日志场景简化了倒排索引结构,去掉日志场景不需要的正排等数据结构,有效降低了索引构建的复杂度。同样的资源,Apache Doris 的写入性能是 Elasticsearch 的 3~5 倍。 -- **存储成本降低**:Elasticsearch 存储瓶颈在于正排、倒排、Docvalue 列存多份存储和通用压缩算法压缩率较低。相比之下,Apache Doris 在存储上进行了以下优化:去掉正排,缩减了 30% 的索引数据量;采用列式存储和 Zstandard 压缩算法,压缩比可达到 5~10 倍,远高于 Elasticsearch 的 1.5 倍;日志数据中冷数据访问频率很低,Apache Doris 冷热分层功能可以将超过定义时间段的日志自动存储到更低的对象存储中,冷数据的存储成本可降低 70% 以上。同样的原始数据,Doris 的存储成本只需要 Elasticsearch 的 20% 左右。 -- **查询性能提升**:Apache Doris 将全文检索的流程简化,跳过了相关性打分等日志场景不需要的算法,加速基础的检索性能。同时针对日志场景常见的查询,比如查询包含某个关键字的最新 100 条日志,在查询规划和执行上做专门的 TopN 动态剪枝等优化。 - -### 分析能力强 - -Apache Doris 支持标准 SQL、兼容 MySQL 协议和语法,因此基于 Apache Doris 构建的日志系统能够使用 SQL 进行日志分析,这使得日志系统具备以下优势: - -- **简单易用**:工程师和数据分析师对于 SQL 非常熟悉,经验可以复用,不需要学习新的技术栈即可快速上手。 -- **生态丰富**:MySQL 生态是数据库领域使用最广泛的语言,因此可以与 MySQL 生态的集成和应用无缝衔接。Doris 可以利用 MySQL 命令行与各种 GUI 
工具、BI 工具等大数据生态结合,实现更复杂及多样化的数据处理分析需求。 -- **分析能力强**:SQL 语言已经成为数据库和大数据分析的事实标准,它具有强大的表达能力和功能,支持检索、聚合、多表 JOIN、子查询、UDF、逻辑视图、物化视图等多种数据分析能力。 - -### Flexible Schema - -下面是一个典型的 JSON 格式半结构化日志样例。顶层字段是一些比较固定的字段,比如日志时间戳(`timestamp`),日志来源(`source`),日志所在机器(`node`),打日志的模块(`component`),日志级别(`level`),客户端请求标识(`clientRequestId`),日志内容(`message`),日志扩展属性(`properties`),基本上每条日志都会有。而扩展属性 `properties` 的内部嵌套字段 `properties.size`、`properties.format` 等是比较动态的,每条日志的字段可能不一样。 - -```JSON -{ - "timestamp": "2014-03-08T00:50:03.8432810Z", - "source": "ADOPTIONCUSTOMERS81", - "node": "Engine000000000405", - "level": "Information", - "component": "DOWNLOADER", - "clientRequestId": "671db15d-abad-94f6-dd93-b3a2e6000672", - "message": "Downloading file path: benchmark/2014/ADOPTIONCUSTOMERS81_94_0.parquet.gz", - "properties": { - "size": 1495636750, - "format": "parquet", - "rowCount": 855138, - "downloadDuration": "00:01:58.3520561" - } -} -``` - -Apache Doris 对 Flexible Schema 的日志数据提供了几个方面的支持: - -- 对于顶层字段的少量变化,可以通过 Light Schema Change 发起 ADD / DROP COLUMN 增加 / 删除列,ADD / DROP INDEX 增加 / 删除索引,能够在秒级完成 Schema 变更。用户在日志平台规划时只需考虑当前需要哪些字段创建索引。 -- 对于类似 `properties` 的扩展字段,提供了原生半结构化数据类型 `VARIANT`,可以写入任何 JSON 数据,自动识别 JSON 中的字段名和类型,并自动拆分频繁出现的字段采用列式存储,以便于后续的分析,还可以对 `VARIANT` 创建倒排索引,加快内部字段的查询和检索。 - -相对于 Elasticsearch 的 Dynamic Mapping,Apache Doris 的 Flexible Schema 有以下优势: - -- 允许一个字段有多种类型,`VARIANT` 自动对字段类型做冲突处理和类型提升,更好地适应日志数据的迭代变化。 -- `VARIANT` 自动将不频繁出现的字段合并成一个列存储,可避免字段、元数据、列过多导致性能问题。 -- 不仅可以动态加列,还可以动态删列、动态增加索引、动态删索引,无需像 Elasticsearch 在一开始对所有字段建索引,减少不必要的成本。 - -## 操作指南 - -### 第 1 步:评估资源 - -在部署集群之前,首先应评估所需服务器硬件资源,包括以下几个关键步骤: - -1. **评估写入资源**:计算公式如下: - - - `平均写入吞吐 = 日增数据量 / 86400 s` - - `峰值写入吞吐 = 平均写入吞吐 * 写入吞吐峰值 / 均值比` - - `峰值写入所需 CPU 核数 = 峰值写入吞吐 / 单核写入吞吐` - -2. **评估存储资源**:计算公式为 `所需存储空间 = 日增数据量 / 压缩率 * 副本数 * 数据存储周期` - -3. **评估查询资源**:查询的资源消耗随查询量和复杂度而异,建议初始预留 50% 的 CPU 资源用于查询,再根据实际测试情况进行调整。 - -4. 
**汇总整合资源**:由第 1 步和第 3 步估算出所需 CPU 核数后,除以单机 CPU 核数,估算出 BE 服务器数量,再根据 BE 服务器数量和第 2 步的结果,估算出每台 BE 服务器所需存储空间,然后分摊到 4~12 块数据盘,计算出单盘存储容量。 - -以每天新增 100 TB 数据量(压缩前)、5 倍压缩率、1 副本、热数据存储 3 天、冷数据存储 30 天、写入吞吐峰值 / 均值比 200%、单核写入吞吐 10 MB/s、查询预留 50% CPU 资源为例,可估算出: - -- FE:3 台服务器,每台配置 16 核 CPU、64 GB 内存、1 块 100 GB SSD 盘 -- BE:15 台服务器,每台配置 32 核 CPU、256 GB 内存、10 块 600 GB SSD 盘 -- S3 对象存储空间:即为预估冷数据存储空间,600 TB - -该例子中,各关键指标的值及具体计算方法可见下表: - -| 关键指标(单位) | 值 | 说明 | -| :------------------------------- | :---- | :----------------------------------------------------------- | -| 日增数据量(TB) | 100 | 根据实际需求填写 | -| 压缩率 | 5 | 一般为 3~10 倍(含索引),根据实际需求填写 | -| 副本数 | 1 | 根据实际需求填写,默认 1 副本,可选值:1,2,3 | -| 热数据存储周期(天) | 3 | 根据实际需求填写 | -| 冷数据存储周期(天) | 30 | 根据实际需求填写 | -| 总存储周期(天) | 33 | 算法:`热数据存储周期 + 冷数据存储周期` | -| 预估热数据存储空间(TB) | 60 | 算法:`日增数据量 / 压缩率 * 副本数 * 热数据存储周期` | -| 预估冷数据存储空间(TB) | 600 | 算法:`日增数据量 / 压缩率 * 副本数 * 冷数据存储周期` | -| 写入吞吐峰值 / 均值比 | 200% | 根据实际需求填写,默认 200% | -| 单机 CPU 核数 | 32 | 根据实际需求填写,默认 32 核 | -| 平均写入吞吐(MB/s) | 1214 | 算法:`日增数据量 / 86400 s` | -| 峰值写入吞吐(MB/s) | 2427 | 算法:`平均写入吞吐 * 写入吞吐峰值 / 均值比` | -| 峰值写入所需 CPU 核数 | 242.7 | 算法:`峰值写入吞吐 / 单核写入吞吐` | -| 查询预留 CPU 百分比 | 50% | 根据实际需求填写,默认 50% | -| 预估 BE 服务器数 | 15.2 | 算法:`峰值写入所需 CPU 核数 / 单机 CPU 核数 /(1 - 查询预留 CPU 百分比)` | -| 预估 BE 服务器数取整 | 15 | 算法:`MAX (副本数,预估 BE 服务器数取整)` | -| 预估每台 BE 服务器存储空间(TB) | 5.7 | 算法:`预估热数据存储空间 / 预估 BE 服务器数 /(1 - 30%)`,其中,30% 是存储空间预留值。建议每台 BE 服务器挂载 4~12 块数据盘,以提高 I/O 能力。 | - -### 第 2 步:部署集群 - -完成资源评估后,可以开始部署 Apache Doris 集群,推荐在物理机及虚拟机环境中进行部署。手动部署集群,可参考 [手动部署](../../install/cluster-deployment/standard-deployment)。 - -### 第 3 步:优化 FE 和 BE 配置 - -完成集群部署后,需分别优化 FE 和 BE 配置参数,以更加契合日志存储与分析的场景。 - -**优化 FE 配置** - -在 `fe/conf/fe.conf` 目录下找到 FE 的相关配置项,并按照以下表格,调整 FE 配置。 - -| 需调整参数 | 说明 | -| :----------------------------------------------------------- | :----------------------------------------------------------- | -| `max_running_txn_num_per_db = 10000` | 高并发导入运行事务数较多,需调高参数。 | -| `streaming_label_keep_max_second = 3600` `label_keep_max_second = 7200` | 高频导入事务标签内存占用多,保留时间调短。 | -| `enable_round_robin_create_tablet = true` | 创建 Tablet 时,采用 Round Robin 策略,尽量均匀。 | -| `tablet_rebalancer_type = partition` | 均衡 Tablet 时,采用每个分区内尽量均匀的策略。 | -| `autobucket_min_buckets = 10` | 将自动分桶的最小分桶数从 1 调大到 10,避免日志量增加时分桶不够。 | -| `max_backend_heartbeat_failure_tolerance_count = 10` | 日志场景下 BE 服务器压力较大,可能短时间心跳超时,因此将容忍次数从 1 调大到 10。 | - -更多关于 FE 配置项的信息,可参考 [FE 配置项](../../admin-manual/config/fe-config)。 - -**优化 BE 配置** - -在 `be/conf/be.conf` 目录下找到 BE 的相关配置项,并按照以下表格,调整 BE 配置。 - -| 模块 | 需调整参数 | 说明 | -| :--------- | :----------------------------------------------------------- | :----------------------------------------------------------- | -| 存储 | `storage_root_path = /path/to/dir1;/path/to/dir2;...;/path/to/dir12` | 配置热数据在磁盘目录上的存储路径。 | -| - | `enable_file_cache = true` | 开启文件缓存。 | -| - | `file_cache_path = [{"path": "/mnt/datadisk0/file_cache", "total_size":53687091200, "query_limit": "10737418240"},{"path": "/mnt/datadisk1/file_cache", "total_size":53687091200,"query_limit": "10737418240"}]` | 配置冷数据的缓存路径和相关设置,具体配置说明如下:
`path`:缓存路径
`total_size`:该缓存路径的总大小,单位为字节,53687091200 字节等于 50 GB
`query_limit`:单次查询可以从缓存路径中查询的最大数据量,单位为字节,10737418240 字节等于 10 GB | -| 写入 | `write_buffer_size = 1073741824` | 增加写入缓冲区(buffer)的文件大小,减少小文件和随机 I/O 操作,提升性能。 | -| - | `max_tablet_version_num = 20000` | 配合建表的 time_series compaction 策略,允许更多版本暂时未合并。 | -| Compaction | `max_cumu_compaction_threads = 8` | 设置为 CPU 核数 / 4,意味着 CPU 资源的 1/4 用于写入,1/4 用于后台 Compaction,2/1 留给查询和其他操作。 | -| - | `inverted_index_compaction_enable = true` | 开启索引合并(index compaction),减少 Compaction 时的 CPU 消耗。 | -| - | `enable_segcompaction = false` `enable_ordered_data_compaction = false` | 关闭日志场景不需要的两个 Compaction 功能。 | -| - | `enable_compaction_priority_scheduling = false` | 低优先级 compaction 在一块盘上限制 2 个任务,会影响 compaction 速度。 | -| - | `total_permits_for_compaction_score = 200000 ` | 该参数用来控制内存,time series 策略下本身可以控制内存。 | -| 缓存 | `disable_storage_page_cache = true` `inverted_index_searcher_cache_limit = 30%` | 因为日志数据量较大,缓存(cache)作用有限,因此关闭数据缓存,调换为索引缓存(index cache)的方式。 | -| - | `inverted_index_cache_stale_sweep_time_sec = 3600` `index_cache_entry_stay_time_after_lookup_s = 3600` | 让索引缓存在内存中尽量保留 1 小时。 | -| - | `enable_inverted_index_cache_on_cooldown = true`
`enable_write_index_searcher_cache = false` | 开启索引上传冷数据存储时自动缓存的功能。 | -| - | `tablet_schema_cache_recycle_interval = 3600` `segment_cache_capacity = 20000` | 减少其他缓存对内存的占用。 | -| - | `inverted_index_ram_dir_enable = true` | 减少写入时索引临时文件带来的 IO 开销。| -| 线程 | `pipeline_executor_size = 24` `doris_scanner_thread_pool_thread_num = 48` | 32 核 CPU 的计算线程和 I/O 线程配置,根据核数等比扩缩。 | -| - | `scan_thread_nice_value = 5` | 降低查询 I/O 线程的优先级,保证写入性能和时效性。 | -| 其他 | `string_type_length_soft_limit_bytes = 10485760` | 将 String 类型数据的长度限制调高至 10 MB。 | -| - | `trash_file_expire_time_sec = 300` `path_gc_check_interval_second = 900` `path_scan_interval_second = 900` | 调快垃圾文件的回收时间。 | - -更多关于 BE 配置项的信息,可参考 [BE 配置项](../../admin-manual/config/be-config)。 - -### 第 4 步:建表 - -由于日志数据的写入和查询都具备明显的特征,因此,在建表时按照本节说明进行针对性配置,以提升性能表现。 - -**配置分区分桶参数** - -分区时,按照以下说明配置: -- 使用时间字段上的 [Range 分区](../../table-design/data-partition/#range-%E5%88%86%E5%8C%BA) (`PARTITION BY RANGE(`ts`)`),并开启 [动态分区](../../table-design/data-partition) (`"dynamic_partition.enable" = "true"`),按天自动管理分区。 -- 使用 Datetime 类型的时间字段作为 Key (`DUPLICATE KEY(ts)`),在查询最新 N 条日志时有数倍加速。 - -分桶时,按照以下说明配置: -- 分桶数量大致为集群磁盘总数的 3 倍,每个桶的数据量压缩后 5GB 左右。 -- 使用 Random 策略 (`DISTRIBUTED BY RANDOM BUCKETS 60`),配合写入时的 Single Tablet 导入,可以提升批量(Batch)写入的效率。 - -更多关于分区分桶的信息,可参考 [数据划分](../../table-design/data-partitioning/basic-concepts)。 - -**配置压缩参数** -- 使用 zstd 压缩算法(`"compression" = "zstd"`), 提高数据压缩率。 - -**配置 Compaction 参数** - -按照以下说明配置 Compaction 参数: - -- 使用 time_series 策略(`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 - -**建立和配置索引参数** - -按照以下说明操作: - -- 对经常查询的字段建立索引 (`USING INVERTED`)。 -- 对需要全文检索的字段,将分词器(parser)参数赋值为 unicode,一般能满足大部分需求。如有支持短语查询的需求,将 support_phrase 参数赋值为 true;如不需要,则设置为 false,以降低存储空间。 - -**配置存储策略** - -按照以下说明操作: - -- 对于热存储数据,如果使用云盘,可配置 1 副本;如果使用物理盘,则至少配置 2 副本 (`"replication_num" = "2"`)。 -- 配置 `log_s3` 的存储位置 (`CREATE RESOURCE "log_s3"`),并设置 `log_policy_3day` 冷热数据分层策略 (`CREATE STORAGE POLICY log_policy_3day`),即在超过 3 天后将数据冷却至 `log_s3` 指定的存储位置。可参考以下代码: - -```sql -CREATE DATABASE log_db; -USE log_db; - -CREATE RESOURCE "log_s3" -PROPERTIES -( - "type" = "s3", - "s3.endpoint" = "your_endpoint_url", - "s3.region" = "your_region", - "s3.bucket" = "your_bucket", - "s3.root.path" = "your_path", - "s3.access_key" = "your_ak", - "s3.secret_key" = "your_sk" -); - -CREATE STORAGE POLICY log_policy_3day -PROPERTIES( - "storage_resource" = "log_s3", - "cooldown_ttl" = "259200" -); - -CREATE TABLE log_table -( - `ts` DATETIME, - `host` TEXT, - `path` TEXT, - `message` TEXT, - INDEX idx_host (`host`) USING INVERTED, - INDEX idx_path (`path`) USING INVERTED, - INDEX idx_message (`message`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true") -) -ENGINE = OLAP -DUPLICATE KEY(`ts`) -PARTITION BY RANGE(`ts`) () -DISTRIBUTED BY RANDOM BUCKETS 60 -PROPERTIES ( - "compression" = "zstd", - "compaction_policy" = "time_series", - "dynamic_partition.enable" = "true", - "dynamic_partition.create_history_partition" = "true", - "dynamic_partition.time_unit" = "DAY", - "dynamic_partition.start" = "-30", - "dynamic_partition.end" = "1", - "dynamic_partition.prefix" = "p", - "dynamic_partition.buckets" = "60", - "dynamic_partition.replication_num" = "2", -- 存算分离不需要 - "replication_num" = "2", -- 存算分离不需要 - "storage_policy" = "log_policy_3day" -- 存算分离不需要 -); -``` - -### 第 5 步:采集日志 - -完成建表后,可进行日志采集。 - -Apache Doris 提供开放、通用的 Stream HTTP APIs,通过这些 APIs,你可与常用的日志采集器打通,包括 Logstash、Filebeat、Kafka 等,从而开展日志采集工作。本节介绍了如何使用 Stream HTTP APIs 对接日志采集器。 - -**对接 Logstash** - -按照以下步骤操作: - -1. 
下载并安装 Logstash Doris Output 插件。你可选择以下两种方式之一: - -- 直接下载:[点此下载](https://apache-doris-releases.oss-accelerate.aliyuncs.com/logstash-output-doris-1.0.0.gem)。 - -- 从源码编译,并运行下方命令安装: - -```sql -./bin/logstash-plugin install logstash-output-doris-1.0.0.gem -``` - -2. 配置 Logstash。需配置以下参数: - -- `logstash.yml`:配置 Logstash 批处理日志的条数和时间,用于提升数据写入性能。 - -```sql -pipeline.batch.size: 1000000 -pipeline.batch.delay: 10000 -``` - - -- `logstash_demo.conf`:配置所采集日志的具体输入路径和输出到 Apache Doris 的设置。 - -```sql -input { - file { - path => "/path/to/your/log" - } -} - -output { - doris { - http_hosts => [ "", "", "] - user => "your_username" - password => "your_password" - db => "your_db" - table => "your_table" - - # doris stream load http headers - headers => { - "format" => "json" - "read_json_by_line" => "true" - "load_to_single_tablet" => "true" - } - - # field mapping: doris fileld name => logstash field name - # %{} to get a logstash field, [] for nested field such as [host][name] for host.name - mapping => { - "ts" => "%{@timestamp}" - "host" => "%{[host][name]}" - "path" => "%{[log][file][path]}" - "message" => "%{message}" - } - log_request => true - log_speed_interval => 10 - } -} -``` - -3. 按照下方命令运行 Logstash,采集日志并输出至 Apache Doris。 - -```shell -./bin/logstash -f logstash_demo.conf -``` - -更多关于 Logstash 配置和使用的说明,可参考 [Logstash Doris Output Plugin](../../ecosystem/logstash)。 - -**对接 Filebeat** - -按照以下步骤操作: - -1. 获取支持输出至 Apache Doris 的 Filebeat 二进制文件。可 [点此下载](https://apache-doris-releases.oss-accelerate.aliyuncs.com/filebeat-doris-1.0.0) 或者从 Apache Doris 源码编译。 -2. 配置 Filebeat。需配置以下参数: - -- `filebeat_demo.yml`:配置所采集日志的具体输入路径和输出到 Apache Doris 的设置。 - - ```yaml - # input - filebeat.inputs: - - type: log - enabled: true - paths: - - /path/to/your/log - # multiline 可以将跨行的日志(比如 Java stacktrace)拼接起来 - multiline: - type: pattern - # 效果:以 yyyy-mm-dd HH:MM:SS 开头的行认为是一条新的日志,其他都拼接到上一条日志 - pattern: '^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' - negate: true - match: after - skip_newline: true - - processors: - # 用 js script 插件将日志中的 \t 替换成空格,避免 JSON 解析报错 - - script: - lang: javascript - source: > - function process(event) { - var msg = event.Get("message"); - msg = msg.replace(/\t/g, " "); - event.Put("message", msg); - } - # 用 dissect 插件做简单的日志解析 - - dissect: - # 2024-06-08 18:26:25,481 INFO (report-thread|199) [ReportHandler.cpuReport():617] begin to handle - tokenizer: "%{day} %{time} %{log_level} (%{thread}) [%{position}] %{content}" - target_prefix: "" - ignore_failure: true - overwrite_keys: true - - # queue and batch - queue.mem: - events: 1000000 - flush.min_events: 100000 - flush.timeout: 10s - - # output - output.doris: - fenodes: [ "http://fehost1:http_port", "http://fehost2:http_port", "http://fehost3:http_port" ] - user: "your_username" - password: "your_password" - database: "your_db" - table: "your_table" - # output string format - ## %{[agent][hostname]} %{[log][file][path]} 是filebeat自带的metadata - ## 常用的 filebeat metadata 还是有采集时间戳 %{[@timestamp]} - ## %{[day]} %{[time]} 是上面 dissect 解析得到字段 - codec_format_string: '{"ts": "%{[day]} %{[time]}", "host": "%{[agent][hostname]}", "path": "%{[log][file][path]}", "message": "%{[message]}"}' - headers: - format: "json" - read_json_by_line: "true" - load_to_single_tablet: "true" - ``` - -3. 
按照下方命令运行 Filebeat,采集日志并输出至 Apache Doris。 - -```shell -chmod +x filebeat-doris-1.0.0 -./filebeat-doris-1.0.0 -c filebeat_demo.yml -``` - -更多关于 Filebeat 配置和使用的说明,可参考 [Beats Doris Output Plugin](../../ecosystem/beats)。 - -**对接 Kafka** - -将 JSON 格式的日志写入 Kafka 的消息队列,创建 Kafka Routine Load,即可让 Apache Doris 从 Kafka 主动拉取数据。 - -可参考如下示例。其中,`property.*` 是 Librdkafka 客户端相关配置,根据实际 Kafka 集群情况配置。 - -```sql --- 准备好 kafka 集群和 topic log__topic_ --- 创建 routine load,从 kafka log__topic_将数据导入 log_table 表 -CREATE ROUTINE LOAD load_log_kafka ON log_db.log_table -COLUMNS(ts, clientip, request, status, size) -PROPERTIES ( -"max_batch_interval" = "10", -"max_batch_rows" = "1000000", -"max_batch_size" = "109715200", -"load_to_single_tablet" = "true", -"timeout" = "600", -"strict_mode" = "false", -"format" = "json" -) -FROM KAFKA ( -"kafka_broker_list" = "host:port", -"kafka_topic" = "log__topic_", -"property.group.id" = "your_group_id", -"property.security.protocol"="SASL_PLAINTEXT", -"property.sasl.mechanism"="GSSAPI", -"property.sasl.kerberos.service.name"="kafka", -"property.sasl.kerberos.keytab"="/path/to/xxx.keytab", -"property.sasl.kerberos.principal"="" -); --- 查看 routine 的状态 -SHOW ROUTINE LOAD; -``` - -更多关于 Kafka 配置和使用的说明,可参考 [Routine Load](../../data-operate/import/import-way/routine-load-manual)。 - -**使用自定义程序采集日志** - -除了对接常用的日志采集器以外,你也可以自定义程序,通过 HTTP API Stream Load 将日志数据导入 Apache Doris。参考以下代码: - -```shell -curl ---location-trusted --u username:password --H "format:json" --H "read_json_by_line:true" --H "load_to_single_tablet:true" --H "timeout:600" --T logfile.json -http://fe_host:fe_http_port/api/log_db/log_table/_stream_load -``` - -在使用自定义程序时,需注意以下关键点: - -- 使用 Basic Auth 进行 HTTP 鉴权,用命令 `echo -n 'username:password' | base64` 进行计算。 -- 设置 HTTP header "format:json",指定数据格式为 JSON。 -- 设置 HTTP header "read_json_by_line:true",指定每行一个 JSON。 -- 设置 HTTP header "load_to_single_tablet:true",指定一次导入写入一个分桶减少导入的小文件。 -- 建议写入客户端一个 Batch 的大小为 100MB ~ 1GB。如果你使用的是 Apache Doris 2.1 及更高版本,需通过服务端 Group Commit 功能,降低客户端 Batch 大小。 - -### 第 6 步:查询和分析日志 - -**日志查询** - -Apache Doris 支持标准 SQL,因此,你可以通过 MySQL 客户端或者 JDBC 等方式连接到集群,执行 SQL 进行日志查询。参考以下命令: - -``` -mysql -h fe_host -P fe_mysql_port -u your_username -Dyour_db_name -``` - -下方列出常见的 5 条 SQL 查询命令,以供参考: - -- 查看最新的 10 条数据 - -```SQL -SELECT * FROM your_table_name ORDER BY ts DESC LIMIT 10; -``` - -- 查询 `host` 为 `8.8.8.8` 的最新 10 条数据 - -```SQL -SELECT * FROM your_table_name WHERE host = '8.8.8.8' ORDER BY ts DESC LIMIT 10; -``` - -- 检索请求字段中有 `error` 或者 `404` 的最新 10 条数据。其中,`MATCH_ANY` 是 Apache Doris 全文检索的 SQL 语法,用于匹配参数中任一关键字。 - -```SQL -SELECT * FROM your_table_name WHERE message MATCH_ANY 'error 404' -ORDER BY ts DESC LIMIT 10; -``` - -- 检索请求字段中有 `image` 和 `faq` 的最新 10 条数据。其中,`MATCH_ALL` 是 Apache Doris 全文检索的 SQL 语法,用于匹配参数中所有关键字。 - -```SQL -SELECT * FROM your_table_name WHERE message MATCH_ALL 'image faq' -ORDER BY ts DESC LIMIT 10; -``` - -- 检索请求字段中有 `image` 和 `faq` 的最新 10 条数据。其中,`MATCH_PHRASE` 是 Apache Doris 全文检索的 SQL 语法,用于匹配参数中所有关键字,并且要求顺序一致。在下方例子中,`a image faq b` 能匹配,但是 `a faq image b` 不能匹配,因为 `image` 和 `faq` 的顺序与查询不一致。 - -```SQL -SELECT * FROM your_table_name WHERE message MATCH_PHRASE 'image faq' -ORDER BY ts DESC LIMIT 10; -``` - -**可视化日志分析** - -一些第三方厂商提供了基于 Apache Doris 的可视化日志分析开发平台,包含类 Kibana Discover 的日志检索分析界面,提供直观、易用的探索式日志分析交互。 - -![WebUI](/images/WebUI-CN.jpeg) - -- 支持全文检索和 SQL 两种模式 -- 支持时间框和直方图上选择查询日志的时间段 -- 支持信息丰富的日志明细展示,还可以展开成 JSON 或表格 -- 在日志数据上下文交互式点击增加和删除筛选条件 -- 搜索结果的字段 Top 值展示,便于发现异常值和进一步下钻分析 - -您可以联系 dev@doris.apache.org 获得更多帮助。 - diff --git 
a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/what-is-apache-doris.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/what-is-apache-doris.md index 94bff83de8824..fa9f8dc01b2f8 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/what-is-apache-doris.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/gettingStarted/what-is-apache-doris.md @@ -139,7 +139,7 @@ Apache Doris 查询引擎是向量化的查询引擎,所有的内存结构能 ![Doris 查询引擎是向量化](/images/getting-started/apache-doris-query-engine-2.png) -Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术, 可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 +Apache Doris 采用了自适应查询执行(Adaptive Query Execution)技术,可以根据 Runtime Statistics 来动态调整执行计划,比如通过 Runtime Filter 技术能够在运行时生成 Filter 推到 Probe 侧,并且能够将 Filter 自动穿透到 Probe 侧最底层的 Scan 节点,从而大幅减少 Probe 的数据量,加速 Join 性能。Apache Doris 的 Runtime Filter 支持 In/Min/Max/Bloom Filter。 在优化器方面,Apache Doris 使用 CBO 和 RBO 结合的优化策略,RBO 支持常量折叠、子查询改写、谓词下推等,CBO 支持 Join Reorder。目前 CBO 还在持续优化中,主要集中在更加精准的统计信息收集和推导,更加精准的代价模型预估等方面。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/log-storage-analysis.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/log-storage-analysis.md similarity index 99% rename from i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/log-storage-analysis.md rename to i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/log-storage-analysis.md index 9669a0ea06df1..e5c7adcac3165 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-2.0/gettingStarted/tutorials/log-storage-analysis.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/log-storage-analysis.md @@ -1,6 +1,6 @@ --- { - "title": "构建日志存储与分析平台", + "title": "日志存储与分析", "language": "zh-CN" } --- @@ -218,13 +218,13 @@ Apache Doris 对 Flexible Schema 的日志数据提供了几个方面的支持 更多关于分区分桶的信息,可参考 [数据划分](../../table-design/data-partitioning/basic-concepts)。 **配置压缩参数** -- 使用 zstd 压缩算法(`"compression" = "zstd"`), 提高数据压缩率。 +- 使用 zstd 压缩算法 (`"compression" = "zstd"`), 提高数据压缩率。 **配置 Compaction 参数** 按照以下说明配置 Compaction 参数: -- 使用 time_series 策略(`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐日志写入的资源写入很重要。 +- 使用 time_series 策略 (`"compaction_policy" = "time_series"`),以减轻写放大效应,对于高吞吐的日志写入很重要。 **建立和配置索引参数** diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md index d3f848618fd9a..186acb51820bd 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/query-acceleration/optimization-technology-principle/pipeline-execution-engine.md @@ -1,6 +1,6 @@ --- { - "title": "并行执行", + "title": "Pipeline 执行引擎", "language": "zh-CN", "toc_min_heading_level": 2, "toc_max_heading_level": 4 } --- @@ -28,71 +28,71 @@ under the License. 
-Doris的并行执行模型是一种Pipeline 执行模型,主要参考了[Hyper](https://db.in.tum.de/~leis/papers/morsels.pdf)论文中Pipeline的实现方式,Pipeline 执行模型能够充分释放多核 CPU 的计算能力,并对 Doris 的查询线程的数目进行限制,解决 Doris 的执行线程膨胀的问题。它的具体设计、实现和效果可以参阅 [DSIP-027](DSIP-027: Support Pipeline Exec Engine - DORIS - Apache Software Foundation) 以及 [DSIP-035](DSIP-035: PipelineX Execution Engine - DORIS - Apache Software Foundation)。 -Doris 3.0 之后,Pipeline 执行模型彻底替换了原有的火山模型,基于Pipeline 执行模型,Doris 实现了 Query、DDL、DML 语句的并行处理。 +Doris 的并行执行模型是一种 Pipeline 执行模型,主要参考了[Hyper](https://db.in.tum.de/~leis/papers/morsels.pdf)论文中 Pipeline 的实现方式,Pipeline 执行模型能够充分释放多核 CPU 的计算能力,并对 Doris 的查询线程的数目进行限制,解决 Doris 的执行线程膨胀的问题。它的具体设计、实现和效果可以参阅 [DSIP-027](DSIP-027: Support Pipeline Exec Engine - DORIS - Apache Software Foundation) 以及 [DSIP-035](DSIP-035: PipelineX Execution Engine - DORIS - Apache Software Foundation)。 +Doris 3.0 之后,Pipeline 执行模型彻底替换了原有的火山模型,基于 Pipeline 执行模型,Doris 实现了 Query、DDL、DML 语句的并行处理。 ## 物理计划 -为了更好的理解Pipeline 执行模型,首先需要介绍一下物理查询计划中两个重要的概念:PlanFragment和PlanNode。我们使用下面这条SQL 作为例子: +为了更好的理解 Pipeline 执行模型,首先需要介绍一下物理查询计划中两个重要的概念:PlanFragment 和 PlanNode。我们使用下面这条 SQL 作为例子: ``` SELECT k1, SUM(v1) FROM A,B WHERE A.k2 = B.k2 GROUP BY k1 ORDER BY SUM(v1); ``` -FE 首先会把它翻译成下面这种逻辑计划,计划中每个节点就是一个PlanNode,每种Node的具体含义,可以参考查看物理计划的介绍。 +FE 首先会把它翻译成下面这种逻辑计划,计划中每个节点就是一个 PlanNode,每种 Node 的具体含义,可以参考查看物理计划的介绍。 ![pip_exec_1](/images/pip_exec_1.png) -由于Doris 是一个MPP的架构,每个查询都会尽可能的让所有的BE 都参与进来并行执行,来降低查询的延时。所以还需要将上述逻辑计划拆分为一个物理计划,拆分物理计划基本上就是在逻辑计划中插入了DataSink和ExchangeNode,通过这两个Node完成了数据在多个BE 之间的Shuffle。拆分完成后,每个PlanFragment 相当于包含了一部分PlanNode,可以作为一个独立的任务发送给BE,每个BE 完成了PlanFragment内包含的PlanNode的计算后,通过DataSink和ExchangeNode 这两个算子把数据shuffle到其他BE上来进行接下来的计算。 +由于 Doris 是一个 MPP 的架构,每个查询都会尽可能的让所有的 BE 都参与进来并行执行,来降低查询的延时。所以还需要将上述逻辑计划拆分为一个物理计划,拆分物理计划基本上就是在逻辑计划中插入了 DataSink 和 ExchangeNode,通过这两个 Node 完成了数据在多个 BE 之间的 Shuffle。拆分完成后,每个 PlanFragment 相当于包含了一部分 PlanNode,可以作为一个独立的任务发送给 BE,每个 BE 完成了 PlanFragment 内包含的 PlanNode 的计算后,通过 DataSink 和 ExchangeNode 这两个算子把数据 shuffle 到其他 BE 上来进行接下来的计算。 ![pip_exec_2](/images/pip_exec_2.png) -所以Doris的规划分为3层: -PLAN:执行计划,一个SQL会被执行规划器翻译成一个执行计划,之后执行计划会提供给执行引擎执行。 +所以 Doris 的规划分为 3 层: +PLAN:执行计划,一个 SQL 会被执行规划器翻译成一个执行计划,之后执行计划会提供给执行引擎执行。 -FRAGMENT:由于DORIS是一个分布式执行引擎。一个完整的执行计划会被切分为多个单机的执行片段。一个FRAGMENT表是一个完整的单机执行片段。多个FRAGMENT组合在一起,构成一个完整的PLAN。 +FRAGMENT:由于 DORIS 是一个分布式执行引擎。一个完整的执行计划会被切分为多个单机的执行片段。一个 FRAGMENT 表是一个完整的单机执行片段。多个 FRAGMENT 组合在一起,构成一个完整的 PLAN。 -PLAN NODE:算子,是执行计划的最小单位。一个FRAGMENT由多个算子构成。每一个算子负责一个实际的执行逻辑,比如聚合,连接等 +PLAN NODE:算子,是执行计划的最小单位。一个 FRAGMENT 由多个算子构成。每一个算子负责一个实际的执行逻辑,比如聚合,连接等 ## Pipeline 执行 -PlanFragment 是FE 发往BE 执行任务的最小单位。BE可能会收到同一个Query的多个不同的PlanFragment,每个PlanFragment都会被单独的处理。在收到PlanFragment 之后,BE会把PlanFragment 拆分为多个Pipeline,进而启动多个PipelineTask 来实现并行执行,提升查询效率。 +PlanFragment 是 FE 发往 BE 执行任务的最小单位。BE 可能会收到同一个 Query 的多个不同的 PlanFragment,每个 PlanFragment 都会被单独的处理。在收到 PlanFragment 之后,BE 会把 PlanFragment 拆分为多个 Pipeline,进而启动多个 PipelineTask 来实现并行执行,提升查询效率。 ![pip_exec_3](/images/pip_exec_3.png) ### Pipeline -一个Pipeline 有一个SourceOperator 和 一个SinkOperator 以及中间的多个其他Operator组成。SourceOperator 代表从外部读取数据,可以是一个表(OlapTable),也可以是一个Buffer(Exchange)。SinkOperator 表示数据的输出,输出可以是通过网络shuffle到别的节点,比如DataStreamSinkOperator,也可以是输出到HashTable,比如Agg算子,JoinBuildHashTable等。 +一个 Pipeline 有一个 SourceOperator 和 一个 SinkOperator 以及中间的多个其他 Operator 组成。SourceOperator 代表从外部读取数据,可以是一个表(OlapTable),也可以是一个 Buffer(Exchange)。SinkOperator 表示数据的输出,输出可以是通过网络 shuffle 到别的节点,比如 DataStreamSinkOperator,也可以是输出到 HashTable,比如 Agg 算子,JoinBuildHashTable 等。 ![pip_exec_4](/images/pip_exec_4.png) -多个Pipeline 
之间实际是有依赖关系的,以JoinNode为例,他实际被拆分到了2个Pipeline 里。其中Pipeline-0是读取Exchange的数据,来构建HashTable;Pipeline-1 是从表里读取数据,来进行Probe。这2个Pipeline 之间是有关联关系的,只有Pipeline-0运行完毕之后才能执行Pipeline-1。这两者之间的依赖关系,称为Dependency。当Pipeline-0 运行完毕后,会调用Dependency的set_ready 方法通知Pipeline-1 可执行。 +多个 Pipeline 之间实际是有依赖关系的,以 JoinNode 为例,他实际被拆分到了 2 个 Pipeline 里。其中 Pipeline-0 是读取 Exchange 的数据,来构建 HashTable;Pipeline-1 是从表里读取数据,来进行 Probe。这 2 个 Pipeline 之间是有关联关系的,只有 Pipeline-0 运行完毕之后才能执行 Pipeline-1。这两者之间的依赖关系,称为 Dependency。当 Pipeline-0 运行完毕后,会调用 Dependency 的 set_ready 方法通知 Pipeline-1 可执行。 ### PipelineTask -Pipeline 实际还是一个逻辑概念,他并不是一个可执行的实体。在有了Pipeline之后,需要进一步的把Pipeline 实例化为多个PipelineTask。将需要读取的数据分配给不同的PipelineTask 最终实现并行处理。同一个Pipeline的多个PipelineTask 之间的Operator 完全相同,他们的区别在于Operator的状态不一样,比如读取的数据不一样,构建出的HashTable 不一样,这些不一样的状态,我们称之为LocalState。 -每个PipelineTask 最终都会被提交到一个线程池中作为独立的任务执行。在Dependency 这种触发机制下,可以更好的利用多核CPU,实现充分的并行。 +Pipeline 实际还是一个逻辑概念,他并不是一个可执行的实体。在有了 Pipeline 之后,需要进一步的把 Pipeline 实例化为多个 PipelineTask。将需要读取的数据分配给不同的 PipelineTask 最终实现并行处理。同一个 Pipeline 的多个 PipelineTask 之间的 Operator 完全相同,他们的区别在于 Operator 的状态不一样,比如读取的数据不一样,构建出的 HashTable 不一样,这些不一样的状态,我们称之为 LocalState。 +每个 PipelineTask 最终都会被提交到一个线程池中作为独立的任务执行。在 Dependency 这种触发机制下,可以更好的利用多核 CPU,实现充分的并行。 ### Operator -在大多数时候,Pipeline 中的每个Operator 都对应了一个PlanNode,但是有一些特殊的算子除外: -- JoinNode,被拆分为JoinBuildOperator和JoinProbeOperator -- AggNode 被拆分为AggSinkOperator和AggSourceOperator -- SortNode 被拆分为SortSinkOperator 和 SortSourceOperator -基本原理是,对于一些breaking 算子(需要把所有的数据都收集齐之后才能运算的算子),把灌入数据的部分拆分为Sink,然后把从这个算子里获取数据的部分称为Source。 +在大多数时候,Pipeline 中的每个 Operator 都对应了一个 PlanNode,但是有一些特殊的算子除外: +- JoinNode,被拆分为 JoinBuildOperator 和 JoinProbeOperator +- AggNode 被拆分为 AggSinkOperator 和 AggSourceOperator +- SortNode 被拆分为 SortSinkOperator 和 SortSourceOperator +基本原理是,对于一些 breaking 算子(需要把所有的数据都收集齐之后才能运算的算子),把灌入数据的部分拆分为 Sink,然后把从这个算子里获取数据的部分称为 Source。 ## Scan 并行化 -扫描数据是一个非常重的IO 操作,它需要从本地磁盘读取大量的数据(如果是数据湖的场景,就需要从HDFS或者S3中读取,延时更长),需要比较多的时间。所以我们在ScanOperator 中引入了并行扫描的技术,ScanOperator会动态的生成多个Scanner,每个Scanner 扫描100w-200w 行左右的数据,每个Scanner 在做数据扫描时,完成相应的数据解压、过滤等计算任务,然后把数据发送给一个DataQueue,供ScanOperator 读取。 +扫描数据是一个非常重的 IO 操作,它需要从本地磁盘读取大量的数据(如果是数据湖的场景,就需要从 HDFS 或者 S3 中读取,延时更长),需要比较多的时间。所以我们在 ScanOperator 中引入了并行扫描的技术,ScanOperator 会动态的生成多个 Scanner,每个 Scanner 扫描 100w-200w 行左右的数据,每个 Scanner 在做数据扫描时,完成相应的数据解压、过滤等计算任务,然后把数据发送给一个 DataQueue,供 ScanOperator 读取。 ![pip_exec_5](/images/pip_exec_5.png) -通过并行扫描的技术可以有效的避免由于分桶不合理或者数据倾斜导致某些ScanOperator 执行时间特别久,把整个查询的延时都拖慢的问题。 +通过并行扫描的技术可以有效的避免由于分桶不合理或者数据倾斜导致某些 ScanOperator 执行时间特别久,把整个查询的延时都拖慢的问题。 ## Local Shuffle -在Pipeline执行模型中,Local Exchange作为一个Pipeline Breaker出现,是在本地将数据重新分发至各个执行任务的技术。它把上游Pipeline输出的全部数据以某种方式(HASH / Round Robin)均匀分发到下游Pipeline的全部Task中。解决执行过程中的数据倾斜的问题,使执行模型不再受数据存储以及plan的限制。接下来我们举例来说明Local Exchange的工作逻辑。 -我们用上述例子中的Pipeline-1为例子进一步阐述Local Exchange如何可以避免数据倾斜。 +在 Pipeline 执行模型中,Local Exchange 作为一个 Pipeline Breaker 出现,是在本地将数据重新分发至各个执行任务的技术。它把上游 Pipeline 输出的全部数据以某种方式(HASH / Round Robin)均匀分发到下游 Pipeline 的全部 Task 中。解决执行过程中的数据倾斜的问题,使执行模型不再受数据存储以及 plan 的限制。接下来我们举例来说明 Local Exchange 的工作逻辑。 +我们用上述例子中的 Pipeline-1 为例子进一步阐述 Local Exchange 如何可以避免数据倾斜。 ![pip_exec_6](/images/pip_exec_6.png) -如上图所示,首先,通过在Pipeline 1中插入Local Exchange,我们把Pipeline 1进一步拆分成Pipeline 1-0和Pipeline 1-1。 -此时,我们不妨假设当前并发等于3(每个Pipeline有3个task),每个task读取存储层的一个bucket,而3个bucket中数据行数分别是1,1,7。则插入Local Exchange前后的执行变化如下: +如上图所示,首先,通过在 Pipeline 1 中插入 Local Exchange,我们把 Pipeline 1 进一步拆分成 Pipeline 1-0 和 Pipeline 1-1。 +此时,我们不妨假设当前并发等于 3(每个 Pipeline 有 3 个 task),每个 task 读取存储层的一个 bucket,而 3 个 bucket 中数据行数分别是 1,1,7。则插入 Local Exchange 前后的执行变化如下: 
![pip_exec_7](/images/pip_exec_7.png) -从图右可以看出,HashJoin和Agg算子需要处理的数据量从(1,1,7)变成了(3,3,3)从而避免了数据倾斜。 -在Doris中,Local Exchange根据一系列规则来决定是否被规划,例如当查询耗时比较大的Join、聚合、窗口函数等算子需要被执行时,我们就需要使用Local Exchange来尽可能避免数据倾斜。 \ No newline at end of file +从图右可以看出,HashJoin 和 Agg 算子需要处理的数据量从 (1,1,7) 变成了 (3,3,3) 从而避免了数据倾斜。 +在 Doris 中,Local Exchange 根据一系列规则来决定是否被规划,例如当查询耗时比较大的 Join、聚合、窗口函数等算子需要被执行时,我们就需要使用 Local Exchange 来尽可能避免数据倾斜。 \ No newline at end of file diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/column-compression.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/column-compression.md index a6a4194b85a4d..7e56a5b6ac443 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/column-compression.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/column-compression.md @@ -1,6 +1,6 @@ --- { - "title": "按列压缩", + "title": "数据压缩", "language": "zh_CN" } --- @@ -44,10 +44,10 @@ Doris 支持多种压缩算法,每种算法在压缩率和解压速度之间 |-------------------------------|------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------| | **无压缩** | - 数据不进行压缩。 | 适用于不需要压缩的场景,例如数据已经被压缩或者存储空间不是问题的情况。 | | **LZ4** | - 压缩和解压速度非常快。
- 压缩比适中。 | 适用于对解压速度要求高的场景,如实时查询或高并发负载。 | -| **LZ4F (LZ4框架)** | - LZ4的扩展版本,支持更灵活的压缩配置。
- 速度快,压缩比适中。 | 适用于需要快速压缩并对配置有细粒度控制的场景。 | -| **LZ4HC (LZ4高压缩)** | - 相比LZ4有更高的压缩比,但压缩速度较慢。
- 解压速度与LZ4相当。 | 适用于需要更高压缩比的场景,同时仍然关注解压速度。 | +| **LZ4F (LZ4 框架)** | - LZ4 的扩展版本,支持更灵活的压缩配置。
- 速度快,压缩比适中。 | 适用于需要快速压缩并对配置有细粒度控制的场景。 | +| **LZ4HC (LZ4 高压缩)** | - 相比 LZ4 有更高的压缩比,但压缩速度较慢。
- 解压速度与 LZ4 相当。 | 适用于需要更高压缩比的场景,同时仍然关注解压速度。 | | **ZSTD (Zstandard)** | - 高压缩比,支持灵活的压缩级别调整。
- 即使在高压缩比下,解压速度仍然很快。 | 适用于对存储效率要求较高且需要平衡查询性能的场景。 | -| **Snappy** | - 设计重点是快速解压。
- 压缩比适中。 | 适用于对解压速度要求高且对CPU消耗低的场景。 | +| **Snappy** | - 设计重点是快速解压。
- 压缩比适中。 | 适用于对解压速度要求高且对 CPU 消耗低的场景。 | | **Zlib** | - 提供良好的压缩比与速度平衡。
- 与其他算法相比,压缩和解压速度较慢,但压缩比更高。 | 适用于对存储效率要求较高且对解压速度不敏感的场景,如归档和冷数据存储。 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/data-model/aggregate.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/data-model/aggregate.md index 8ea30edc81f66..79036e7c5f8f5 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/data-model/aggregate.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/data-model/aggregate.md @@ -88,7 +88,7 @@ DISTRIBUTED BY HASH(user_id) BUCKETS 10; * BITMAP_UNION:BIMTAP 类型的列的聚合方式,进行位图的并集聚合。 -:::info 提示: +:::info 提示: 如果以上的聚合方式无法满足业务需求,可以选择使用 agg_state 类型。 ::: @@ -129,7 +129,7 @@ SELECT * FROM example_tbl_agg; ## AGG_STATE -::: info 提示: +:::info 提示: AGG_STATE 是实验特性,建议在开发与测试环境中使用。 ::: diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/tiered-storage/overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/tiered-storage/overview.md index a0df890036e5a..2df6839366077 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/tiered-storage/overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/version-3.0/table-design/tiered-storage/overview.md @@ -1,6 +1,6 @@ --- { - "title": "分层存储", + "title": "冷热数据分层概述", "language": "zh-CN" } --- @@ -30,6 +30,6 @@ under the License. |--------------------|------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------| | **存算分离** | 用户具备部署存算分离的条件 | - 数据以单副本完全存储在对象存储中
- 通过本地缓存加速热数据访问
- 存储与计算资源独立扩展,显著降低存储成本 | | **本地分层** | 存算一体模式下,用户希望进一步优化本地存储资源 | - 支持将冷数据从 SSD 冷却到 HDD
- 充分利用本地存储层级特性,节省高性能存储成本 | -| **远程分层** | 存算一体模式下,使用廉价的对象存储或者 HDFS 进一步降低成本 | - 冷数据以单副本形式保存到对象存储或者 HDFS中
- 热数据继续使用本地存储
- 不能对一个表和本地分层混合使用 | +| **远程分层** | 存算一体模式下,使用廉价的对象存储或者 HDFS 进一步降低成本 | - 冷数据以单副本形式保存到对象存储或者 HDFS 中
- 热数据继续使用本地存储
- 不能对一个表和本地分层混合使用 | 通过上述模式,Doris 能够灵活适配用户的部署条件,实现查询效率与存储成本的平衡。 diff --git a/sidebars.json b/sidebars.json index d7aba92fcfcba..8c0c46d2e2a14 100644 --- a/sidebars.json +++ b/sidebars.json @@ -5,16 +5,8 @@ "label": "Getting Started", "collapsed": false, "items": [ - "gettingStarted/what-is-new", "gettingStarted/what-is-apache-doris", - "gettingStarted/quick-start", - { - "type": "category", - "label": "Tutorials", - "items": [ - "gettingStarted/tutorials/log-storage-analysis" - ] - } + "gettingStarted/quick-start" ] }, { @@ -31,8 +23,8 @@ "label": "Installation Preparation", "items": [ "install/preparation/env-checking", - "install/preparation/cluster-planning", - "install/preparation/os-checking" + "install/preparation/cluster-planning", + "install/preparation/os-checking" ] }, { @@ -73,7 +65,7 @@ }, { "type": "category", - "label": "Deployment on Cloud", + "label": "Deploying on Cloud", "items": [ "install/cluster-deployment/doris-on-aws" ] @@ -160,6 +152,7 @@ "data-operate/import/data-source/hdfs", "data-operate/import/data-source/amazon-s3", "data-operate/import/data-source/google-cloud-storage", + "data-operate/import/data-source/azure-storage", "data-operate/import/data-source/aliyun-oss", "data-operate/import/data-source/huawei-obs", "data-operate/import/data-source/tencent-cos", @@ -436,6 +429,7 @@ } ] }, + "log-storage-analysis", { "type": "category", "label": "Compute-Storage Decoupled", @@ -477,6 +471,16 @@ } ] }, + { + "type": "category", + "label": "Benchmark", + "collapsed": false, + "items": [ + "benchmark/ssb", + "benchmark/tpch", + "benchmark/tpcds" + ] + }, { "type": "category", "label": "Management", @@ -779,16 +783,6 @@ "admin-manual/compaction" ] }, - { - "type": "category", - "label": "Benchmark", - "collapsed": false, - "items": [ - "benchmark/ssb", - "benchmark/tpch", - "benchmark/tpcds" - ] - }, { "type": "category", "label": "Ecosystem", diff --git a/src/pages/index.tsx b/src/pages/index.tsx index 303820302b249..cbac55705c603 100644 --- a/src/pages/index.tsx +++ b/src/pages/index.tsx @@ -37,7 +37,7 @@ export default function Home(): JSX.Element { }, { label: Get started, - link: '/docs/gettingStarted/what-is-new', + link: '/docs/gettingStarted/what-is-apache-doris', type: 'ghost', }, { diff --git a/versioned_docs/version-1.2/data-operate/export/export_with_mysql_dump.md b/versioned_docs/version-1.2/data-operate/export/export_with_mysql_dump.md index ee2ce765ae1a5..75bae202e0b7f 100644 --- a/versioned_docs/version-1.2/data-operate/export/export_with_mysql_dump.md +++ b/versioned_docs/version-1.2/data-operate/export/export_with_mysql_dump.md @@ -1,6 +1,6 @@ --- { -"title": "Use mysqldump data to export table structure or data", +"title": "Using MySQL Dump", "language": "en" } --- diff --git a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md b/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md deleted file mode 100644 index eea87af87d43f..0000000000000 --- a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-hudi.md +++ /dev/null @@ -1,313 +0,0 @@ ---- -{ - "title": "Using Doris and Hudi", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of data warehouses with the low cost and flexibility of data lakes, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. 
- -In recent versions, Apache Doris has deepened its integration with data lakes and has evolved a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, enabling automatic metadata mapping and data access for various data sources, along with numerous performance optimizations for external data reading and query execution. It now fully possesses the ability to build a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris' Data Lakehouse architecture was significantly enhanced, improving the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100-fold increase in data transfer efficiency. - -![Building lakehouse using doris and huid](/images/lakehouse-architecture-for-doris-and-hudi.png) - -## Apache Doris & Hudi - -[Apache Hudi](https://hudi.apache.org/) is currently one of the most popular open data lake formats and a transactional data lake management platform, supporting various mainstream query engines including Apache Doris. - -Apache Doris has also enhanced its ability to read Apache Hudi data tables: - -- Supports Copy on Write Table: Snapshot Query -- Supports Merge on Read Table: Snapshot Queries, Read Optimized Queries -- Supports Time Travel -- Supports Incremental Read - -With Apache Doris' high-performance query execution and Apache Hudi's real-time data management capabilities, efficient, flexible, and cost-effective data querying and analysis can be achieved. It also provides robust data lineage, auditing, and incremental processing functionalities. The combination of Apache Doris and Apache Hudi has been validated and promoted in real business scenarios by multiple community users: - -- Real-time data analysis and processing: Common scenarios such as real-time data updates and query analysis in industries like finance, advertising, and e-commerce require real-time data processing. Hudi enables real-time data updates and management while ensuring data consistency and reliability. Doris efficiently handles large-scale data query requests in real-time, meeting the demands of real-time data analysis and processing effectively when combined. -- Data lineage and auditing: For industries with high requirements for data security and accuracy like finance and healthcare, data lineage and auditing are crucial functionalities. Hudi offers Time Travel functionality for viewing historical data states, combined with Apache Doris' efficient querying capabilities, enabling quick analysis of data at any point in time for precise lineage and auditing. -- Incremental data reading and analysis: Large-scale data analysis often faces challenges of large data volumes and frequent updates. Hudi supports incremental data reading, allowing users to process only the changed data without full data updates. Additionally, Apache Doris' Incremental Read feature enhances this process, significantly improving data processing and analysis efficiency. -- Cross-data source federated queries: Many enterprises have complex data sources stored in different databases. 
Doris' Multi-Catalog feature supports automatic mapping and synchronization of various data sources, enabling federated queries across data sources. This greatly shortens the data flow path and enhances work efficiency for enterprises needing to retrieve and integrate data from multiple sources for analysis. - -This article will introduce readers to how to quickly set up a test and demonstration environment for Apache Doris + Apache Hudi in a Docker environment, and demonstrate various operations to help readers get started quickly. - -For more information, please refer to [Hudi Catalog](../../../lakehouse/datalake-analytics/hudi) - -## User Guide - -All scripts and code mentioned in this article can be obtained from this address: [https://github.com/apache/doris/tree/master/samples/datalake/hudi](https://github.com/apache/doris/tree/master/samples/datalake/hudi) - -### 01 Environment Preparation - -This article uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 2.1.4, can be modified | -| Apache Hudi | 0.14 | -| Apache Spark | 3.4.2 | -| Apache Hive | 2.1.3 | -| MinIO | 2022-05-26T05-48-41Z | - -### 02 Environment Deployment - -1. Create a Docker network - - `sudo docker network create -d bridge hudi-net` - -2. Start all components - - `sudo ./start-hudi-compose.sh` - - > Note: Before starting, you can modify the `DORIS_PACKAGE` and `DORIS_DOWNLOAD_URL` in `start-hudi-compose.sh` to the desired Doris version. It is recommended to use version 2.1.4 or higher. - -3. After starting, you can use the following script to log in to Spark command line or Doris command line: - - ``` - -- Doris - sudo ./login-spark.sh - - -- Spark - sudo ./login-doris.sh - ``` - -### 03 Data Preparation - -Next, generate Hudi data through Spark. As shown in the code below, there is already a Hive table named `customer` in the cluster. You can create a Hudi table using this Hive table: - -``` --- ./login-spark.sh -spark-sql> use default; - --- create a COW table -spark-sql> CREATE TABLE customer_cow -USING hudi -TBLPROPERTIES ( - type = 'cow', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; - --- create a MOR table -spark-sql> CREATE TABLE customer_mor -USING hudi -TBLPROPERTIES ( - type = 'mor', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; -``` - -### 04 Data Query - -As shown below, a Catalog named `hudi` has been created in the Doris cluster (can be viewed using `SHOW CATALOGS`). The following is the creation statement for this Catalog: - -``` --- Already created, no need to execute again -CREATE CATALOG `hudi` PROPERTIES ( - "type"="hms", - 'hive.metastore.uris' = 'thrift://hive-metastore:9083', - "s3.access_key" = "minio", - "s3.secret_key" = "minio123", - "s3.endpoint" = "http://minio:9000", - "s3.region" = "us-east-1", - "use_path_style" = "true" -); -``` - -1. Manually refresh this Catalog to synchronize the created Hudi table: - - ``` - -- ./login-doris.sh - doris> REFRESH CATALOG hudi; - ``` - -2. Operations on data in Hudi using Spark are immediately visible in Doris without the need to refresh the Catalog. We insert a row of data into both COW and MOR tables using Spark: - - ``` - spark-sql> insert into customer_cow values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. 
final, furious requests", 25); - spark-sql> insert into customer_mor values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 25); - ``` - -3. Through Doris, you can directly query the latest inserted data: - - ``` - doris> use hudi.default; - doris> select * from customer_cow where c_custkey = 100; - doris> select * from customer_mor where c_custkey = 100; - ``` - -4. Insert data with c_custkey=32 that already exists using Spark, thus overwriting the existing data: - - ``` - spark-sql> insert into customer_cow values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - spark-sql> insert into customer_mor values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - ``` - -5. With Doris, you can query the updated data: - - ``` - doris> select * from customer_cow where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - doris> select * from customer_mor where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - ``` - -### 05 Incremental Read - -Incremental Read is one of the features provided by Hudi. With Incremental Read, users can obtain incremental data within a specified time range, enabling incremental processing of data. In this regard, Doris can query the changed data after inserting `c_custkey=100`. As shown below, we inserted a data with `c_custkey=32`: - -``` -doris> select * from customer_cow@incr('beginTime'='20240603015018572'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. 
final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_cow', 'latest_state', '20240603015018572'); - -doris> select * from customer_mor@incr('beginTime'='20240603015058442'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_mor', 'latest_state', '20240603015058442'); -``` - -### 06 TimeTravel - -Doris supports querying specific snapshot versions of Hudi data, thereby enabling Time Travel functionality for data. First, you can query the commit history of two Hudi tables using Spark: - -``` -spark-sql> call show_commits(table => 'customer_cow', limit => 10); -20240603033556094 20240603033558249 commit 448833 0 1 1 183 0 0 -20240603015444737 20240603015446588 commit 450238 0 1 1 202 1 0 -20240603015018572 20240603015020503 commit 436692 1 0 1 1 0 0 -20240603013858098 20240603013907467 commit 44902033 100 0 25 18751 0 0 - -spark-sql> call show_commits(table => 'customer_mor', limit => 10); -20240603033745977 20240603033748021 deltacommit 1240 0 1 1 0 0 0 -20240603015451860 20240603015453539 deltacommit 1434 0 1 1 1 1 0 -20240603015058442 20240603015100120 deltacommit 436691 1 0 1 1 0 0 -20240603013918515 20240603013922961 deltacommit 44904040 100 0 25 18751 0 0 -``` - -Next, using Doris, you can execute `c_custkey=32` to query the data snapshot before the data insertion. As shown below, the data with `c_custkey=32` has not been updated yet: - -> Note: Time Travel syntax is currently not supported by the new optimizer. You need to first execute `set enable_nereids_planner=false;` to disable the new optimizer. This issue will be fixed in future versions. - -``` -doris> select * from customer_cow for time as of '20240603015018572' where c_custkey = 32 or c_custkey = 100; -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | 15 | -| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. 
final, furious requests | 25 |
-+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+
--- compare with spark-sql
-spark-sql> select * from customer_mor timestamp as of '20240603015018572' where c_custkey = 32 or c_custkey = 100;
-
-doris> select * from customer_mor for time as of '20240603015058442' where c_custkey = 32 or c_custkey = 100;
-+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+
-| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey |
-+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+
-| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 25 |
-| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | 15 |
-+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+
-spark-sql> select * from customer_mor timestamp as of '20240603015058442' where c_custkey = 32 or c_custkey = 100;
-```
-
-## Query Optimization
-
-Data in Apache Hudi can be roughly divided into two categories: baseline data and incremental data. Baseline data is typically merged Parquet files, while incremental data refers to the data increments generated by INSERT, UPDATE, or DELETE operations. Baseline data can be read directly, while incremental data needs to be read through Merge on Read.
-
-When querying Hudi COW tables, or running Read Optimized queries on MOR tables, the data is all baseline data and can be read directly using Doris' native Parquet Reader, providing fast query responses. For incremental data, Doris needs to access Hudi's Java SDK through JNI calls. To achieve optimal query performance, Apache Doris divides the data in a query into baseline and incremental parts and reads each with the appropriate method.
-
-To verify this optimization approach, we can use the EXPLAIN statement to see how much of the data in the example query below is baseline data and how much is incremental data. For the COW table, all 101 data shards are baseline data (`hudiNativeReadSplits=101/101`), so the COW table can be read entirely by Doris' native Parquet Reader, delivering the best query performance. For the MOR table, most data shards are baseline data (`hudiNativeReadSplits=100/101`), with only one shard being incremental data, which also provides good query performance.
-
-```
--- COW table is read natively
-doris> explain select * from customer_cow where c_custkey = 32;
-| 0:VHUDI_SCAN_NODE(68) |
-| table: customer_cow |
-| predicates: (c_custkey[#5] = 32) |
-| inputSplitNum=101, totalFileSize=45338886, scanRanges=101 |
-| partition=26/26 |
-| cardinality=1, numNodes=1 |
-| pushdown agg=NONE |
-| hudiNativeReadSplits=101/101 |
-
--- MOR table: because the update to `c_custkey = 32` only produced a log file for one split, 100 splits are read natively, while the split with the log file is read by JNI. 
-doris> explain select * from customer_mor where c_custkey = 32; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 32) | -| inputSplitNum=101, totalFileSize=45340731, scanRanges=101 | -| partition=26/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=100/101 | -``` - -You can further observe the changes in Hudi baseline data and incremental data by performing some deletion operations using Spark: - -``` --- Use delete statement to see more differences -spark-sql> delete from customer_cow where c_custkey = 64; -doris> explain select * from customer_cow where c_custkey = 64; - -spark-sql> delete from customer_mor where c_custkey = 64; -doris> explain select * from customer_mor where c_custkey = 64; -``` - -Additionally, you can reduce the data volume further by using partition conditions for partition pruning to improve query speed. In the example below, partition pruning is done using the partition condition `c_nationkey=15`, allowing the query request to access data from only one partition (`partition=1/26`). - -``` --- customer_xxx is partitioned by c_nationkey, we can use the partition column to prune data -doris> explain select * from customer_mor where c_custkey = 64 and c_nationkey = 15; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 64), (c_nationkey[#12] = 15) | -| inputSplitNum=4, totalFileSize=1798186, scanRanges=4 | -| partition=1/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=3/4 | -``` diff --git a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md b/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md deleted file mode 100644 index 3a6159407bc8d..0000000000000 --- a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md +++ /dev/null @@ -1,304 +0,0 @@ ---- -{ - "title": "Using Doris and Iceberg", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of a data warehouse with the low cost and flexibility of a data lake, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. - -In recent versions, Apache Doris has deepened its integration with data lakes and now offers a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, enabling automatic metadata mapping and data access for various data sources, along with numerous performance optimizations for external data reading and query execution. It now fully supports building a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris further strengthened its Data Lakehouse architecture, enhancing the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100x improvement in data transfer efficiency. 
- -![Building Lakehouse using Doris and Iceberg](/images/lakehouse-architecture-for-doris-and-iceberg.png) - -## Apache Doris & Iceberg - -Apache Iceberg is an open-source, high-performance, and highly reliable data lake table format that enables the analysis and management of massive-scale data. It supports various mainstream query engines, including Apache Doris, is compatible with HDFS and various object cloud storage, and features ACID compliance, schema evolution, advanced filtering, hidden partitioning, and partition layout evolution to ensure high-performance queries, data reliability, consistency, and flexibility with features like time travel and version rollback. - -Apache Doris provides native support for several core features of Iceberg: - -- Supports multiple Iceberg Catalog types such as Hive Metastore, Hadoop, REST, Glue, Google Dataproc Metastore, DLF, etc. -- Native support for Iceberg V1/V2 table formats and reading of Position Delete, Equality Delete files. -- Supports querying Iceberg table snapshot history through table functions. -- Supports Time Travel functionality. -- Native support for the Iceberg table engine. It allows Apache Doris to directly create, manage, and write data to Iceberg tables. It supports comprehensive partition Transform functions, providing capabilities like hidden partitioning and partition layout evolution. - -Users can quickly build an efficient Data Lakehouse solution based on Apache Doris + Apache Iceberg to flexibly address various real-time data analysis and processing needs. - -- Use the high-performance query engine of Doris to perform data analysis by associating Iceberg table data and other data sources, building a **unified federated data analysis platform**. -- Manage and build Iceberg tables directly through Doris, complete data cleaning, processing, and writing to Iceberg tables in Doris, building a **unified data processing platform for data lakes**. -- Share Doris data with other upstream and downstream systems for further processing through the Iceberg table engine, building a **unified open data storage platform**. - -In the future, Apache Iceberg will serve as one of the native table engines for Apache Doris, providing more comprehensive analysis and management functions for lake-formatted data. Apache Doris will also gradually support more advanced features of Apache Iceberg, including Update/Delete/Merge, sorting during write-back, incremental data reading, metadata management, etc., to jointly build a unified, high-performance, real-time data lake platform. - -For more information, please refer to [Iceberg Catalog](../../../lakehouse/datalake-analytics/iceberg) - -## User Guide - -This document mainly explains how to quickly set up an Apache Doris + Apache Iceberg testing & demonstration environment in a Docker environment and demonstrate the usage of various functions. - -All scripts and code mentioned in this document can be obtained from this address: [https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 Environment Preparation - -This document uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 2.1.5, can be modified | -| Apache Iceberg | 1.4.3 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - -### 02 Environment Deployment - -1. Start all components - - `bash ./start_all.sh` - -2. 
After starting, you can use the following script to log in to the Doris command line: - - ``` - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 Create Iceberg Table - -After logging into the Doris command line, an Iceberg Catalog named Iceberg has already been created in the Doris cluster (can be viewed by `SHOW CATALOGS`/`SHOW CREATE CATALOG iceberg`). The following is the creation statement for this Catalog: - -``` --- Already created -CREATE CATALOG `iceberg` PROPERTIES ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "warehouse" = "s3://warehouse/", - "uri" = "http://rest:8181", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "http://minio:9000" -); -``` - -Create a database and an Iceberg table in the Iceberg Catalog: - -``` -mysql> SWITCH iceberg; -Query OK, 0 rows affected (0.00 sec) - -mysql> CREATE DATABASE nyc; -Query OK, 0 rows affected (0.12 sec) - -mysql> CREATE TABLE iceberg.nyc.taxis - ( - vendor_id BIGINT, - trip_id BIGINT, - trip_distance FLOAT, - fare_amount DOUBLE, - store_and_fwd_flag STRING, - ts DATETIME - ) - PARTITION BY LIST (vendor_id, DAY(ts)) () - PROPERTIES ( - "compression-codec" = "zstd", - "write-format" = "parquet" - ); -Query OK, 0 rows affected (0.15 sec) -``` - -### 04 Data Insertion - -Insert data into the Iceberg table: - -``` -mysql> INSERT INTO iceberg.nyc.taxis - VALUES - (1, 1000371, 1.8, 15.32, 'N', '2024-01-01 9:15:23'), - (2, 1000372, 2.5, 22.15, 'N', '2024-01-02 12:10:11'), - (2, 1000373, 0.9, 9.01, 'N', '2024-01-01 3:25:15'), - (1, 1000374, 8.4, 42.13, 'Y', '2024-01-03 7:12:33'); -Query OK, 4 rows affected (1.61 sec) -{'status':'COMMITTED', 'txnId':'10085'} -``` - -Create an Iceberg table using `CREATE TABLE AS SELECT`: - -``` -mysql> CREATE TABLE iceberg.nyc.taxis2 AS SELECT * FROM iceberg.nyc.taxis; -Query OK, 6 rows affected (0.25 sec) -{'status':'COMMITTED', 'txnId':'10088'} -``` - -### 05 Data Query - -- Simple query - - ``` - mysql> SELECT * FROM iceberg.nyc.taxis; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.37 sec) - - mysql> SELECT * FROM iceberg.nyc.taxis2; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.35 sec) - ``` - -- Partition pruning - - ``` - mysql> SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < 
'2024-01-02'; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 1 row in set (0.06 sec) - - mysql> EXPLAIN VERBOSE SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'; - - .... - | 0:VICEBERG_SCAN_NODE(71) - | table: taxis - | predicates: (ts[#5] < '2024-01-02 00:00:00'), (vendor_id[#0] = 2), (ts[#5] >= '2024-01-01 00:00:00') - | inputSplitNum=1, totalFileSize=3539, scanRanges=1 - | partition=1/0 - | backends: - | 10002 - | s3://warehouse/wh/nyc/taxis/data/vendor_id=2/ts_day=2024-01-01/40e6ca404efa4a44-b888f23546d3a69c_5708e229-2f3d-4b68-a66b-44298a9d9815-0.zstd.parquet start: 0 length: 3539 - | cardinality=6, numNodes=1 - | pushdown agg=NONE - | icebergPredicatePushdown= - | ref(name="ts") < 1704153600000000 - | ref(name="vendor_id") == 2 - | ref(name="ts") >= 1704067200000000 - .... - ``` - - By examining the result of the `EXPLAIN VERBOSE` statement, it can be seen that the predicate condition `vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'` ultimately only hits one partition (`partition=1/0`). - - It can also be observed that because a partition Transform function `DAY(ts)` was specified when creating the table, the original value in the data `2024-01-01 03:25:15.000000` will be transformed into the partition information in the file directory `ts_day=2024-01-01`. - -### 06 Time Travel - -Let's insert a few more rows of data: - -``` -INSERT INTO iceberg.nyc.taxis VALUES (1, 1000375, 8.8, 55.55, 'Y', '2024-01-01 8:10:22'), (3, 1000376, 7.4, 32.35, 'N', '2024-01-02 1:14:45'); -Query OK, 2 rows affected (0.17 sec) -{'status':'COMMITTED', 'txnId':'10086'} - -mysql> SELECT * FROM iceberg.nyc.taxis; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.11 sec) -``` - -Use the `iceberg_meta` table function to query the snapshot information of the table: - -``` -mysql> select * from iceberg_meta("table" = "iceberg.nyc.taxis", "query_type" = "snapshots"); -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 
committed_at | snapshot_id | parent_id | operation | manifest_list | summary | -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 2024-07-29 03:38:22 | 8483933166442433486 | -1 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-8483933166442433486-1-5f7b7736-8022-4ba1-9db2-51ae7553be4d.avro | {"added-data-files":"4","added-records":"4","added-files-size":"14156","changed-partition-count":"4","total-records":"4","total-files-size":"14156","total-data-files":"4","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -| 2024-07-29 03:40:23 | 4726331391239920914 | 8483933166442433486 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-4726331391239920914-1-6aa3d142-6c9c-4553-9c04-08ad4d49a4ea.avro | {"added-data-files":"2","added-records":"2","added-files-size":"7078","changed-partition-count":"2","total-records":"6","total-files-size":"21234","total-data-files":"6","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -2 rows in set (0.07 sec) -``` - -Query a specified snapshot using the `FOR VERSION AS OF` statement: - -``` -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 8483933166442433486; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 4726331391239920914; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | 
-+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.04 sec) -``` - -Query a specified snapshot using the `FOR TIME AS OF` statement: - -``` -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:38:23"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.04 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:40:22"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) -``` diff --git a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md b/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md deleted file mode 100644 index deb4e5f472423..0000000000000 --- a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md +++ /dev/null @@ -1,341 +0,0 @@ ---- -{ - "title": "Using Doris and LakeSoul", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of data warehouses with the low cost and flexibility of data lakes, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. - -In recent versions, Apache Doris has deepened its integration with data lakes and has evolved a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, enabling automatic metadata mapping and data access for various data sources, along with numerous performance optimizations for external data reading and query execution. It now fully possesses the ability to build a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris' Data Lakehouse architecture was significantly enhanced, improving the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. 
For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100-fold increase in data transfer efficiency. - -![Building lakehouse using doris and lakesoul](/images/lakehouse-architecture-for-doris-and-lakesoul.png) - -# Apache Doris & LakeSoul - -LakeSoul is a cloud-native lakehouse framework developed by DMetaSoul and donated to the Linux Foundation AI & Data Foundation in May 2023. It features high scalability in metadata management, ACID transactions, efficient and flexible upsert operations, schema evolution, and batch-stream integrated processing . - -With Apache Doris' high-performance query engine and LakeSoul's efficient data management, users can achieve: - -- Real-time data ingestion into the lake: Leveraging LakeSoul's architecture, data can be ingested into the lake with high efficiency and low latency, supporting various data update capabilities including aggregation, deduplication, and partial column updates. -- High-performance data processing and analysis: LakeSoul's capabilities such as batch-stream integrated processing and schema evolution can be seamlessly integrated with Doris' powerful query engine, enabling fast querying and analysis responses for lake data. - -In the future, Apache Doris will gradually support more advanced features of LakeSoul, such as CDC stream synchronization and automatic schema evolution, to jointly build a unified, high-performance, real-time lakehouse platform. - -This article will explain how to quickly set up an Apache Doris + LakeSoul testing & demonstration environment and demonstrate the usage of various features, showcasing the integration and benefits of using both systems in a lakehouse architecture . - -For more information, please refer to [LakeSoul Catalog](../../../lakehouse/datalake-analytics/lakesoul) - - -## User Guide - -All scripts and code mentioned in this article can be obtained from this address: [https://github.com/apache/doris/tree/master/samples/datalake/lakesoul](https://github.com/apache/doris/tree/master/samples/datalake/lakesoul) - -### 01 Environment Preparation - -This article uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 3.0.2| -| LakeSoul | 2.6.1 | -| Postgres | 14.5 | -| Apache Spark | 3.3.1 | -| Apache Flink | 1.17 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - - -### 02 Environment Deployment - -1. Start all components - - ``` - bash ./start_all.sh - ``` - -2. After starting, you can use the following script to log in to the Doris command line: - - ``` - -- login doris - bash ./start_doris_client.sh - ``` - - -### 03 Data Query - -As shown below, a Catalog named `lakesoul` has been created in the Doris cluster (can be viewed using `SHOW CATALOGS`). The following is the creation statement for this Catalog: - -```sql - -- Already created - CREATE CATALOG `lakesoul` PROPERTIES ( - 'type'='lakesoul', - 'lakesoul.pg.username'='lakesoul_test', - 'lakesoul.pg.password'='lakesoul_test', - 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', - 'minio.endpoint'='http://minio:9000', - 'minio.access_key'='admin', - 'minio.secret_key'='password' - ); - - ``` - The LakeSoul Table `lakesoul.tpch.customer` has already been loaded into Doris. Query the data in Doris. 
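-Before running the queries below, it can be useful to confirm that the `lakesoul` catalog is visible on the Doris side. The following is a minimal sanity check, assuming the catalog was created with the statement above; `REFRESH CATALOG` simply re-syncs the cached metadata if a newly synced table does not show up yet:
-
-```sql
--- Optional check, assuming the `lakesoul` catalog above already exists
-Doris> SHOW CATALOGS;
-Doris> SHOW CREATE CATALOG lakesoul;
--- Re-sync the catalog metadata if newly synced LakeSoul tables are not visible yet
-Doris> REFRESH CATALOG lakesoul;
-```
-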
-- Simple query - ```sql - Doris> use `lakesoul`.`tpch`; - Database changed - - Doris> show tables; - +---------------------+ - | Tables_in_tpch | - +---------------------+ - | customer_from_spark | - +---------------------+ - 1 row in set (0.00 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | - | 14 | Customer#000000014 | KXkletMlL2JQEA | 1 | 11-845-129-3851 | 5266.30 | FURNITURE | , ironic packages across the unus | - | 30 | Customer#000000030 | nJDsELGAavU63Jl0c5NKsKfL8rIJQQkQnYL2QJY | 1 | 11-764-165-5076 | 9321.01 | BUILDING | lithely final requests. furiously unusual account | - | 59 | Customer#000000059 | zLOCP0wh92OtBihgspOGl4 | 1 | 11-355-584-3112 | 3458.60 | MACHINERY | ously final packages haggle blithely after the express deposits. furiou | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - 4 rows in set (3.14 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey desc limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | 14983 | Customer#000014983 | ERN3vq5Fvt4DL | 1 | 11-424-279-1846 | 841.22 | AUTOMOBILE | furiously slyly special foxes. express theodolites cajole carefully. special dinos haggle pinto | - | 14968 | Customer#000014968 | ,sykKTZBzVFl7ito1750v2TRYwmkRl2nvqGHwmx | 1 | 11-669-222-9657 | 6106.77 | HOUSEHOLD | ts above the furiously even deposits haggle across | - | 14961 | Customer#000014961 | JEIORcsBp6RpLYH 9gNdDyWJ | 1 | 11-490-251-5554 | 4006.35 | HOUSEHOLD | quests detect carefully final platelets! quickly final frays haggle slyly blithely final acc | - | 14940 | Customer#000014940 | bNoyCxPuqSwPLjbqjEUNGN d0mSP | 1 | 11-242-677-1085 | 8829.48 | HOUSEHOLD | ver the quickly express braids. 
regular dependencies haggle fluffily quickly i | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - 4 rows in set (0.10 sec) - ``` - -- Partition pruning -Doris can perform partition pruning on LakeSoul and speed up the query process through native reading. We can check this through `explain verbose`. - - ```sql - Doris> explain verbose select * from customer_from_spark where c_nationkey < 3; - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | Explain String(Old Planner) | - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | PLAN FRAGMENT 0 | - | OUTPUT EXPRS: | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_custkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_name` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_address` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_nationkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_phone` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_acctbal` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_mktsegment` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_comment` | - | PARTITION: UNPARTITIONED | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | VRESULT SINK | - | MYSQL_PROTOCAL | - | | - | 1:VEXCHANGE | - | offset: 0 | - | tuple ids: 0 | - | | - | PLAN FRAGMENT 1 | - | | - | PARTITION: RANDOM | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | STREAM DATA SINK | - | EXCHANGE ID: 01 | - | UNPARTITIONED | - | | - | 0:VplanNodeName | - | table: customer_from_spark | - | predicates: (`c_nationkey` < 3) | - | inputSplitNum=12, totalFileSize=0, scanRanges=12 | - | partition=0/0 | - | backends: | - | 10002 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00000-0568c817-d6bc-4fa1-bb9e-b311069b131c_00000.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00001-d99a8fe6-61ab-4285-94da-2f84f8746a8a_00001.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00002-8a8e396f-685f-4b0f-87fa-e2a3fe5be87e_00002.c000.parquet start: 0 length: 0 | - | ... other 8 files ... 
| - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=0/part-00003-d5b598cd-5bed-412c-a26f-bb4bc9c937bc_00003.c000.parquet start: 0 length: 0 | - | numNodes=1 | - | pushdown agg=NONE | - | tuple ids: 0 | - | | - | Tuples: | - | TupleDescriptor{id=0, tbl=customer_from_spark} | - | SlotDescriptor{id=0, col=c_custkey, colUniqueId=0, type=int, nullable=false, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=1, col=c_name, colUniqueId=1, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=2, col=c_address, colUniqueId=2, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=3, col=c_nationkey, colUniqueId=3, type=int, nullable=false, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=4, col=c_phone, colUniqueId=4, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=5, col=c_acctbal, colUniqueId=5, type=decimalv3(15,2), nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=6, col=c_mktsegment, colUniqueId=6, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - | SlotDescriptor{id=7, col=c_comment, colUniqueId=7, type=text, nullable=true, isAutoIncrement=false, subColPath=null} | - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - 57 rows in set (0.03 sec) - - ``` - - - By examining the result of the `EXPLAIN VERBOSE` statement, it can be seen that the predicate condition `c_nationkey < 3` ultimately only hits one partition (`partition=0/0`). - -### 04 CDC Table Support - -Launch Flink CDC Job to sync mysql table. The mysql table is loaded when launching docker compose. - -``` -bash start_flink_cdc_job.sh -``` - -```sql -Start flink-cdc job... -SLF4J: Class path contains multiple SLF4J bindings. -SLF4J: Found binding in [jar:file:/opt/flink/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class] -SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class] -SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation. -SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory] -Loading class `com.mysql.jdbc.Driver'. This is deprecated. The new driver class is `com.mysql.cj.jdbc.Driver'. The driver is automatically registered via the SPI and manual loading of the driver class is generally unnecessary. -Job has been submitted with JobID d1b3641dcd1ad85c6b373d49b1867e68 - -``` - - -Flink CDC Job will be launched. We can check the process of launching at `doris client` by recreate the lakesoul catalog. After the Flink CDC Job has been launched, we can see the syncing LakeSoul CDC table at `doris client`. 
- -```sql -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - - -Doris> drop catalog if exists lakesoul; -Query OK, 0 rows affected (0.00 sec) - -Doris> create catalog `lakesoul` properties ('type'='lakesoul', 'lakesoul.pg.username'='lakesoul_test', 'lakesoul.pg.password'='lakesoul_test', 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', 'minio.endpoint'='http://minio:9000', 'minio.access_key'='admin', 'minio.secret_key'='password'); -Query OK, 0 rows affected (0.01 sec) - -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer | -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. 
furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (1.09 sec) - -``` - -Enter the `mysql client` and try to modify data. - -``` -bash start_mysql_client.sh -``` - -Try update row from `mysql client`. - -```sql -mysql> update customer set c_acctbal=2211.26 where c_custkey=1; -Query OK, 1 row affected (0.01 sec) -Rows matched: 1 Changed: 1 Warnings: 0 -``` - -Back to `doris client` and check the data changing. - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. 
carefully even theodolites haggle slyly along the ide | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (0.11 sec) - -``` - -Try delete row from `mysql client`. - -```sql -mysql> delete from customer where c_custkey = 2; -Query OK, 1 row affected (0.01 sec) -``` - -Back to `doris client` and check the data changing. - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. 
foxes cajole accor | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -8 rows in set (0.11 sec) - -``` diff --git a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md b/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md deleted file mode 100644 index 26aba30e9623e..0000000000000 --- a/versioned_docs/version-1.2/gettingStarted/tutorials/building-lakehouse/doris-paimon.md +++ /dev/null @@ -1,270 +0,0 @@ ---- -{ - "title": "Using Doris and Paimon", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of data warehouses with the low cost and flexibility of data lakes, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. - -In recent versions, Apache Doris has deepened its integration with data lakes and has evolved a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, achieving automatic metadata mapping and data access for various data sources, along with many performance optimizations for external data reading and query execution. It now fully possesses the ability to build a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris' Data Lakehouse architecture was significantly enhanced, strengthening the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100-fold improvement in data transfer efficiency. - -![Building lakehouse using Doris and Paimon](/images/lakehouse-architecture-for-doris-and-paimon.png) - -## Apache Doris & Paimon - -Apache Paimon is a data lake format that innovatively combines the advantages of data lake formats and LSM structures, successfully introducing efficient real-time streaming update capabilities into data lake architecture. This enables Paimon to efficiently manage data and perform real-time analysis, providing strong support for building real-time Data Lakehouse architecture. - -To fully leverage Paimon's capabilities and improve query efficiency for Paimon data, Apache Doris provides native support for several of Paimon's latest features: - -- Supports various types of Paimon Catalogs such as Hive Metastore and FileSystem. -- Native support for Paimon 0.6's Primary Key Table Read Optimized feature. -- Native support for Paimon 0.8's Primary Key Table Deletion Vector feature. - -With Apache Doris' high-performance query engine and Apache Paimon's efficient real-time streaming update capabilities, users can achieve: - -- Real-time data ingestion into the lake: Leveraging Paimon's LSM-Tree model, data ingestion into the lake can be reduced to a minute-level timeliness. 
Additionally, Paimon supports various data update capabilities including aggregation, deduplication, and partial column updates, making data flow more flexible and efficient. -- High-performance data processing and analysis: Paimon's technologies such as Append Only Table, Read Optimized, and Deletion Vector can be seamlessly integrated with Doris' powerful query engine, enabling fast querying and analysis responses for lake data. - -In the future, Apache Doris will gradually support more advanced features of Apache Paimon, including Time Travel and incremental data reading, to jointly build a unified, high-performance, real-time lakehouse platform. - -This article will explain how to quickly set up an Apache Doris + Apache Paimon testing & demonstration environment in a Docker environment and demonstrate the usage of various features. - -For more information, please refer to [Paimon Catalog](../../../lakehouse/datalake-analytics/paimon.md) - -## User Guide - -All scripts and code mentioned in this article can be obtained from the following address: [https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 Environment Preparation - -This article uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 2.1.5, can be modified | -| Apache Paimon | 0.8 | -| Apache Flink | 1.18 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - -### 02 Environment Deployment - -1. Start all components - - `bash ./start_all.sh` - -2. After starting, you can use the following scripts to log in to the Flink command line or Doris command line: - - ``` - -- login flink - bash ./start_flink_client.sh - - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 Data Preparation - -After logging into the Flink command line, you can see a pre-built table. The table already contains some data that can be viewed using Flink SQL. - -``` -Flink SQL> use paimon.db_paimon; -[INFO] Execute statement succeed. 
- -Flink SQL> show tables; -+------------+ -| table name | -+------------+ -| customer | -+------------+ -1 row in set - -Flink SQL> show create table customer; -+------------------------------------------------------------------------+ -| result | -+------------------------------------------------------------------------+ -| CREATE TABLE `paimon`.`db_paimon`.`customer` ( - `c_custkey` INT NOT NULL, - `c_name` VARCHAR(25), - `c_address` VARCHAR(40), - `c_nationkey` INT NOT NULL, - `c_phone` CHAR(15), - `c_acctbal` DECIMAL(12, 2), - `c_mktsegment` CHAR(10), - `c_comment` VARCHAR(117), - CONSTRAINT `PK_c_custkey_c_nationkey` PRIMARY KEY (`c_custkey`, `c_nationkey`) NOT ENFORCED -) PARTITIONED BY (`c_nationkey`) -WITH ( - 'bucket' = '1', - 'path' = 's3://warehouse/wh/db_paimon.db/customer', - 'deletion-vectors.enabled' = 'true' -) - | -+-------------------------------------------------------------------------+ -1 row in set - -Flink SQL> desc customer; -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| name | type | null | key | extras | watermark | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| c_custkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_name | VARCHAR(25) | TRUE | | | | -| c_address | VARCHAR(40) | TRUE | | | | -| c_nationkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_phone | CHAR(15) | TRUE | | | | -| c_acctbal | DECIMAL(12, 2) | TRUE | | | | -| c_mktsegment | CHAR(10) | TRUE | | | | -| c_comment | VARCHAR(117) | TRUE | | | | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -8 rows in set - -Flink SQL> select * from customer order by c_custkey limit 4; -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platel... | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic... | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic,... | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tl... | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious ... | -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -4 rows in set -``` - -### 04 Data Query - -As shown below, a Catalog named `paimon` has been created in the Doris cluster (can be viewed using SHOW CATALOGS). 
The following is the statement for creating this Catalog: - -``` --- Already created, no need to execute -CREATE CATALOG `paimon` PROPERTIES ( - "type" = "paimon", - "warehouse" = "s3://warehouse/wh/", - "s3.endpoint"="http://minio:9000", - "s3.access_key"="admin", - "s3.secret_key"="password", - "s3.region"="us-east-1" -); -``` - -You can query Paimon's data in Doris: - -``` -mysql> use paimon.db_paimon; -Reading table information for completion of table and column names -You can turn off this feature to get a quicker startup with -A - -Database changed -mysql> show tables; -+---------------------+ -| Tables_in_db_paimon | -+---------------------+ -| customer | -+---------------------+ -1 row in set (0.00 sec) - -mysql> select * from customer order by c_custkey limit 4; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -4 rows in set (1.89 sec) -``` - -### 05 Read Incremental Data - -You can update the data in the Paimon table using Flink SQL: - -``` -Flink SQL> update customer set c_address='c_address_update' where c_nationkey = 1; -[INFO] Submitting SQL update statement to the cluster... -[INFO] SQL update statement has been successfully submitted to the cluster: -Job ID: ff838b7b778a94396b332b0d93c8f7ac -``` - -After the Flink SQL execution is complete, you can directly view the latest data in Doris: - -``` -mysql> select * from customer where c_nationkey=1 limit 2; -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -| 3 | Customer#000000003 | c_address_update | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly.
blithely even accounts abov | -| 513 | Customer#000000513 | c_address_update | 1 | 11-861-303-6887 | 955.37 | HOUSEHOLD | press along the quickly regular instructions. regular requests against the carefully ironic s | -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -2 rows in set (0.19 sec) -``` - -### Benchmark - -We conducted a simple test on the TPCDS 1000 dataset in Paimon (0.8) version, using Apache Doris 2.1.5 version and Trino 422 version, both with the Primary Key Table Read Optimized feature enabled. - -![](/images/quick-start/lakehouse-paimon-benchmark.PNG) - -From the test results, it can be seen that Doris' average query performance on the standard static test set is 3-5 times that of Trino. In the future, we will optimize the Deletion Vector to further improve query efficiency in real business scenarios. - -## Query Optimization - -For baseline data, after introducing the Primary Key Table Read Optimized feature in Apache Paimon version 0.6, the query engine can directly access the underlying Parquet/ORC files, significantly improving the reading efficiency of baseline data. For unmerged incremental data (data increments generated by INSERT, UPDATE, or DELETE), they can be read through Merge-on-Read. In addition, Paimon introduced the Deletion Vector feature in version 0.8, which further enhances the query engine's efficiency in reading incremental data. -Apache Doris supports reading Deletion Vector through native Reader and performing Merge on Read. We demonstrate the query methods for baseline data and incremental data in a query using Doris' EXPLAIN statement. - -``` -mysql> explain verbose select * from customer where c_nationkey < 3; -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| Explain String(Nereids Planner) | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| ............... | -| | -| 0:VPAIMON_SCAN_NODE(68) | -| table: customer | -| predicates: (c_nationkey[#3] < 3) | -| inputSplitNum=4, totalFileSize=238324, scanRanges=4 | -| partition=3/0 | -| backends: | -| 10002 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-15cee5b7-1bd7-42ca-9314-56d92c62c03b-0.orc start: 0 length: 66600 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-5d50255a-2215-4010-b976-d5dc656f3444-0.orc start: 0 length: 44501 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=2/bucket-0/data-e98fb7ef-ec2b-4ad5-a496-713cb9481d56-0.orc start: 0 length: 64059 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=0/bucket-0/data-431be05d-50fa-401f-9680-d646757d0f95-0.orc start: 0 length: 63164 | -| cardinality=18751, numNodes=1 | -| pushdown agg=NONE | -| paimonNativeReadSplits=4/4 | -| PaimonSplitStats: | -| SplitStat [type=NATIVE, rowCount=1542, rawFileConvertable=true, hasDeletionVector=true] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| tuple ids: 0 -| ............... 
| | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -67 rows in set (0.23 sec) -``` - -It can be seen that the table just updated by Flink SQL contains 4 shards, and all shards can be accessed through Native Reader (paimonNativeReadSplits=4/4). In addition, the hasDeletionVector property of the first shard is true, indicating that the shard has a corresponding Deletion Vector, and data will be filtered according to the Deletion Vector when reading. - diff --git a/versioned_docs/version-2.0/data-operate/export/export-with-mysql-dump.md b/versioned_docs/version-2.0/data-operate/export/export-with-mysql-dump.md index 51f12f41e4917..1c8961f7468cc 100644 --- a/versioned_docs/version-2.0/data-operate/export/export-with-mysql-dump.md +++ b/versioned_docs/version-2.0/data-operate/export/export-with-mysql-dump.md @@ -1,6 +1,6 @@ --- { -"title": "Exporting Data or Table Structures", +"title": "Using MySQL Dump", "language": "en" } --- diff --git a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md b/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md deleted file mode 100644 index eea87af87d43f..0000000000000 --- a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-hudi.md +++ /dev/null @@ -1,313 +0,0 @@ ---- -{ - "title": "Using Doris and Hudi", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of data warehouses with the low cost and flexibility of data lakes, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. - -In recent versions, Apache Doris has deepened its integration with data lakes and has evolved a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, enabling automatic metadata mapping and data access for various data sources, along with numerous performance optimizations for external data reading and query execution. It now fully possesses the ability to build a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris' Data Lakehouse architecture was significantly enhanced, improving the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100-fold increase in data transfer efficiency. - -![Building lakehouse using doris and huid](/images/lakehouse-architecture-for-doris-and-hudi.png) - -## Apache Doris & Hudi - -[Apache Hudi](https://hudi.apache.org/) is currently one of the most popular open data lake formats and a transactional data lake management platform, supporting various mainstream query engines including Apache Doris. 
- -Apache Doris has also enhanced its ability to read Apache Hudi data tables: - -- Supports Copy on Write Table: Snapshot Query -- Supports Merge on Read Table: Snapshot Queries, Read Optimized Queries -- Supports Time Travel -- Supports Incremental Read - -With Apache Doris' high-performance query execution and Apache Hudi's real-time data management capabilities, efficient, flexible, and cost-effective data querying and analysis can be achieved. It also provides robust data lineage, auditing, and incremental processing functionalities. The combination of Apache Doris and Apache Hudi has been validated and promoted in real business scenarios by multiple community users: - -- Real-time data analysis and processing: Common scenarios such as real-time data updates and query analysis in industries like finance, advertising, and e-commerce require real-time data processing. Hudi enables real-time data updates and management while ensuring data consistency and reliability. Doris efficiently handles large-scale data query requests in real-time, meeting the demands of real-time data analysis and processing effectively when combined. -- Data lineage and auditing: For industries with high requirements for data security and accuracy like finance and healthcare, data lineage and auditing are crucial functionalities. Hudi offers Time Travel functionality for viewing historical data states, combined with Apache Doris' efficient querying capabilities, enabling quick analysis of data at any point in time for precise lineage and auditing. -- Incremental data reading and analysis: Large-scale data analysis often faces challenges of large data volumes and frequent updates. Hudi supports incremental data reading, allowing users to process only the changed data without full data updates. Additionally, Apache Doris' Incremental Read feature enhances this process, significantly improving data processing and analysis efficiency. -- Cross-data source federated queries: Many enterprises have complex data sources stored in different databases. Doris' Multi-Catalog feature supports automatic mapping and synchronization of various data sources, enabling federated queries across data sources. This greatly shortens the data flow path and enhances work efficiency for enterprises needing to retrieve and integrate data from multiple sources for analysis. - -This article will introduce readers to how to quickly set up a test and demonstration environment for Apache Doris + Apache Hudi in a Docker environment, and demonstrate various operations to help readers get started quickly. - -For more information, please refer to [Hudi Catalog](../../../lakehouse/datalake-analytics/hudi) - -## User Guide - -All scripts and code mentioned in this article can be obtained from this address: [https://github.com/apache/doris/tree/master/samples/datalake/hudi](https://github.com/apache/doris/tree/master/samples/datalake/hudi) - -### 01 Environment Preparation - -This article uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 2.1.4, can be modified | -| Apache Hudi | 0.14 | -| Apache Spark | 3.4.2 | -| Apache Hive | 2.1.3 | -| MinIO | 2022-05-26T05-48-41Z | - -### 02 Environment Deployment - -1. Create a Docker network - - `sudo docker network create -d bridge hudi-net` - -2. 
Start all components - - `sudo ./start-hudi-compose.sh` - - > Note: Before starting, you can modify the `DORIS_PACKAGE` and `DORIS_DOWNLOAD_URL` in `start-hudi-compose.sh` to the desired Doris version. It is recommended to use version 2.1.4 or higher. - -3. After starting, you can use the following scripts to log in to the Spark command line or the Doris command line: - - ``` - -- Spark - sudo ./login-spark.sh - - -- Doris - sudo ./login-doris.sh - ``` - -### 03 Data Preparation - -Next, generate Hudi data through Spark. As shown in the code below, there is already a Hive table named `customer` in the cluster. You can create a Hudi table using this Hive table: - -``` --- ./login-spark.sh -spark-sql> use default; - --- create a COW table -spark-sql> CREATE TABLE customer_cow -USING hudi -TBLPROPERTIES ( - type = 'cow', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; - --- create a MOR table -spark-sql> CREATE TABLE customer_mor -USING hudi -TBLPROPERTIES ( - type = 'mor', - primaryKey = 'c_custkey', - preCombineField = 'c_name' -) -PARTITIONED BY (c_nationkey) -AS SELECT * FROM customer; -``` - -### 04 Data Query - -As shown below, a Catalog named `hudi` has been created in the Doris cluster (can be viewed using `SHOW CATALOGS`). The following is the creation statement for this Catalog: - -``` --- Already created, no need to execute again -CREATE CATALOG `hudi` PROPERTIES ( - "type"="hms", - 'hive.metastore.uris' = 'thrift://hive-metastore:9083', - "s3.access_key" = "minio", - "s3.secret_key" = "minio123", - "s3.endpoint" = "http://minio:9000", - "s3.region" = "us-east-1", - "use_path_style" = "true" -); -``` - -1. Manually refresh this Catalog to synchronize the created Hudi table: - - ``` - -- ./login-doris.sh - doris> REFRESH CATALOG hudi; - ``` - -2. Operations on data in Hudi using Spark are immediately visible in Doris without the need to refresh the Catalog. We insert a row of data into both COW and MOR tables using Spark: - - ``` - spark-sql> insert into customer_cow values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 25); - spark-sql> insert into customer_mor values (100, "Customer#000000100", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 25); - ``` - -3. Through Doris, you can directly query the latest inserted data: - - ``` - doris> use hudi.default; - doris> select * from customer_cow where c_custkey = 100; - doris> select * from customer_mor where c_custkey = 100; - ``` - -4. Use Spark to insert data whose `c_custkey=32` already exists, overwriting the existing data: - - ``` - spark-sql> insert into customer_cow values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - spark-sql> insert into customer_mor values (32, "Customer#000000032_update", "jD2xZzi", "25-430-914-2194", 3471.59, "BUILDING", "cial ideas. final, furious requests", 15); - ``` - -5.
With Doris, you can query the updated data: - - ``` - doris> select * from customer_cow where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - doris> select * from customer_mor where c_custkey = 32; - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - | 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | - +-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ - ``` - -### 05 Incremental Read - -Incremental Read is one of the features provided by Hudi. With Incremental Read, users can obtain incremental data within a specified time range, enabling incremental processing of data. In this regard, Doris can query the changed data after inserting `c_custkey=100`. As shown below, we inserted a data with `c_custkey=32`: - -``` -doris> select * from customer_cow@incr('beginTime'='20240603015018572'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_cow', 'latest_state', '20240603015018572'); - -doris> select * from customer_mor@incr('beginTime'='20240603015058442'); -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -| 32 | Customer#000000032_update | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. 
final, furious requests | 15 | -+-----------+---------------------------+-----------+-----------------+-----------+--------------+-------------------------------------+-------------+ -spark-sql> select * from hudi_table_changes('customer_mor', 'latest_state', '20240603015058442'); -``` - -### 06 TimeTravel - -Doris supports querying specific snapshot versions of Hudi data, thereby enabling Time Travel functionality for data. First, you can query the commit history of two Hudi tables using Spark: - -``` -spark-sql> call show_commits(table => 'customer_cow', limit => 10); -20240603033556094 20240603033558249 commit 448833 0 1 1 183 0 0 -20240603015444737 20240603015446588 commit 450238 0 1 1 202 1 0 -20240603015018572 20240603015020503 commit 436692 1 0 1 1 0 0 -20240603013858098 20240603013907467 commit 44902033 100 0 25 18751 0 0 - -spark-sql> call show_commits(table => 'customer_mor', limit => 10); -20240603033745977 20240603033748021 deltacommit 1240 0 1 1 0 0 0 -20240603015451860 20240603015453539 deltacommit 1434 0 1 1 1 1 0 -20240603015058442 20240603015100120 deltacommit 436691 1 0 1 1 0 0 -20240603013918515 20240603013922961 deltacommit 44904040 100 0 25 18751 0 0 -``` - -Next, using Doris, you can execute `c_custkey=32` to query the data snapshot before the data insertion. As shown below, the data with `c_custkey=32` has not been updated yet: - -> Note: Time Travel syntax is currently not supported by the new optimizer. You need to first execute `set enable_nereids_planner=false;` to disable the new optimizer. This issue will be fixed in future versions. - -``` -doris> select * from customer_cow for time as of '20240603015018572' where c_custkey = 32 or c_custkey = 100; -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | 15 | -| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. final, furious requests | 25 | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ --- compare with spark-sql -spark-sql> select * from customer_mor timestamp as of '20240603015018572' where c_custkey = 32 or c_custkey = 100; - -doris> select * from customer_mor for time as of '20240603015058442' where c_custkey = 32 or c_custkey = 100; -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| c_custkey | c_name | c_address | c_phone | c_acctbal | c_mktsegment | c_comment | c_nationkey | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -| 100 | Customer#000000100 | jD2xZzi | 25-430-914-2194 | 3471.59 | BUILDING | cial ideas. 
final, furious requests | 25 | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | 15 | -+-----------+--------------------+---------------------------------------+-----------------+-----------+--------------+--------------------------------------------------+-------------+ -spark-sql> select * from customer_mor timestamp as of '20240603015058442' where c_custkey = 32 or c_custkey = 100; -``` - -## Query Optimization - -Data in Apache Hudi can be roughly divided into two categories: baseline data and incremental data. Baseline data is typically merged Parquet files, while incremental data refers to data increments generated by INSERT, UPDATE, or DELETE operations. Baseline data can be read directly, while incremental data needs to be read through Merge on Read. - -For querying Hudi COW tables or Read Optimized queries on MOR tables, the data belongs to baseline data and can be directly read using Doris' native Parquet Reader, providing fast query responses. For incremental data, Doris needs to access Hudi's Java SDK through JNI calls. To achieve optimal query performance, Apache Doris divides the data in a query into baseline and incremental data parts and reads them using the aforementioned methods. - -To verify this optimization approach, we can use the EXPLAIN statement to see how much baseline and incremental data is present in the query examples below. For a COW table, all 101 data shards are baseline data (`hudiNativeReadSplits=101/101`), so the COW table can be entirely read directly using Doris' Parquet Reader, resulting in the best query performance. For a MOR table, most data shards are baseline data (`hudiNativeReadSplits=100/101`), with one shard being incremental data, which also provides good query performance. - -``` --- COW table is read natively -doris> explain select * from customer_cow where c_custkey = 32; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_cow | -| predicates: (c_custkey[#5] = 32) | -| inputSplitNum=101, totalFileSize=45338886, scanRanges=101 | -| partition=26/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=101/101 | - --- MOR table: because only the base file contains `c_custkey = 32` that is updated, 100 splits are read natively, while the split with log file is read by JNI. -doris> explain select * from customer_mor where c_custkey = 32; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 32) | -| inputSplitNum=101, totalFileSize=45340731, scanRanges=101 | -| partition=26/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=100/101 | -``` - -You can further observe the changes in Hudi baseline data and incremental data by performing some deletion operations using Spark: - -``` --- Use delete statement to see more differences -spark-sql> delete from customer_cow where c_custkey = 64; -doris> explain select * from customer_cow where c_custkey = 64; - -spark-sql> delete from customer_mor where c_custkey = 64; -doris> explain select * from customer_mor where c_custkey = 64; -``` - -Additionally, you can reduce the data volume further by using partition conditions for partition pruning to improve query speed. In the example below, partition pruning is done using the partition condition `c_nationkey=15`, allowing the query request to access data from only one partition (`partition=1/26`).
- -``` --- customer_xxx is partitioned by c_nationkey, we can use the partition column to prune data -doris> explain select * from customer_mor where c_custkey = 64 and c_nationkey = 15; -| 0:VHUDI_SCAN_NODE(68) | -| table: customer_mor | -| predicates: (c_custkey[#5] = 64), (c_nationkey[#12] = 15) | -| inputSplitNum=4, totalFileSize=1798186, scanRanges=4 | -| partition=1/26 | -| cardinality=1, numNodes=1 | -| pushdown agg=NONE | -| hudiNativeReadSplits=3/4 | -``` diff --git a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md b/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md deleted file mode 100644 index c4f3d3438fdf9..0000000000000 --- a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-iceberg.md +++ /dev/null @@ -1,470 +0,0 @@ ---- -{ - "title": "Using Doris and Iceberg", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of a data warehouse with the low cost and flexibility of a data lake, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. - -In recent versions, Apache Doris has deepened its integration with data lakes and now offers a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, enabling automatic metadata mapping and data access for various data sources, along with numerous performance optimizations for external data reading and query execution. It now fully supports building a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris further strengthened its Data Lakehouse architecture, enhancing the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100x improvement in data transfer efficiency. - -![Building Lakehouse using Doris and Iceberg](/images/lakehouse-architecture-for-doris-and-iceberg.png) - -## Apache Doris & Iceberg - -Apache Iceberg is an open-source, high-performance, and highly reliable data lake table format that enables the analysis and management of massive-scale data. It supports various mainstream query engines, including Apache Doris, is compatible with HDFS and various object cloud storage, and features ACID compliance, schema evolution, advanced filtering, hidden partitioning, and partition layout evolution to ensure high-performance queries, data reliability, consistency, and flexibility with features like time travel and version rollback. - -Apache Doris provides native support for several core features of Iceberg: - -- Supports multiple Iceberg Catalog types such as Hive Metastore, Hadoop, REST, Glue, Google Dataproc Metastore, DLF, etc. -- Native support for Iceberg V1/V2 table formats and reading of Position Delete, Equality Delete files. -- Supports querying Iceberg table snapshot history through table functions. -- Supports Time Travel functionality. -- Native support for the Iceberg table engine. 
It allows Apache Doris to directly create, manage, and write data to Iceberg tables. It supports comprehensive partition Transform functions, providing capabilities like hidden partitioning and partition layout evolution. - -Users can quickly build an efficient Data Lakehouse solution based on Apache Doris + Apache Iceberg to flexibly address various real-time data analysis and processing needs. - -- Use the high-performance query engine of Doris to perform data analysis by associating Iceberg table data and other data sources, building a **unified federated data analysis platform**. -- Manage and build Iceberg tables directly through Doris, complete data cleaning, processing, and writing to Iceberg tables in Doris, building a **unified data processing platform for data lakes**. -- Share Doris data with other upstream and downstream systems for further processing through the Iceberg table engine, building a **unified open data storage platform**. - -In the future, Apache Iceberg will serve as one of the native table engines for Apache Doris, providing more comprehensive analysis and management functions for lake-formatted data. Apache Doris will also gradually support more advanced features of Apache Iceberg, including Update/Delete/Merge, sorting during write-back, incremental data reading, metadata management, etc., to jointly build a unified, high-performance, real-time data lake platform. - -For more information, please refer to [Iceberg Catalog](../../../lakehouse/datalake-analytics/iceberg) - -## User Guide - -This document mainly explains how to quickly set up an Apache Doris + Apache Iceberg testing & demonstration environment in a Docker environment and demonstrate the usage of various functions. - -All scripts and code mentioned in this document can be obtained from this address: [https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 Environment Preparation - -This document uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 2.1.5, can be modified | -| Apache Iceberg | 1.4.3 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - -### 02 Environment Deployment - -1. Start all components - - `bash ./start_all.sh` - -2. After starting, you can use the following script to log in to the Doris command line: - - ``` - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 Create Iceberg Table - -After logging into the Doris command line, an Iceberg Catalog named Iceberg has already been created in the Doris cluster (can be viewed by `SHOW CATALOGS`/`SHOW CREATE CATALOG iceberg`). 
The following is the creation statement for this Catalog: - -``` --- Already created -CREATE CATALOG `iceberg` PROPERTIES ( - "type" = "iceberg", - "iceberg.catalog.type" = "rest", - "warehouse" = "s3://warehouse/", - "uri" = "http://rest:8181", - "s3.access_key" = "admin", - "s3.secret_key" = "password", - "s3.endpoint" = "http://minio:9000" -); -``` - -Create a database and an Iceberg table in the Iceberg Catalog: - -``` -mysql> SWITCH iceberg; -Query OK, 0 rows affected (0.00 sec) - -mysql> CREATE DATABASE nyc; -Query OK, 0 rows affected (0.12 sec) - -mysql> CREATE TABLE iceberg.nyc.taxis - ( - vendor_id BIGINT, - trip_id BIGINT, - trip_distance FLOAT, - fare_amount DOUBLE, - store_and_fwd_flag STRING, - ts DATETIME - ) - PARTITION BY LIST (vendor_id, DAY(ts)) () - PROPERTIES ( - "compression-codec" = "zstd", - "write-format" = "parquet" - ); -Query OK, 0 rows affected (0.15 sec) -``` - -### 04 Data Insertion - -Insert data into the Iceberg table: - -``` -mysql> INSERT INTO iceberg.nyc.taxis - VALUES - (1, 1000371, 1.8, 15.32, 'N', '2024-01-01 9:15:23'), - (2, 1000372, 2.5, 22.15, 'N', '2024-01-02 12:10:11'), - (2, 1000373, 0.9, 9.01, 'N', '2024-01-01 3:25:15'), - (1, 1000374, 8.4, 42.13, 'Y', '2024-01-03 7:12:33'); -Query OK, 4 rows affected (1.61 sec) -{'status':'COMMITTED', 'txnId':'10085'} -``` - -Create an Iceberg table using `CREATE TABLE AS SELECT`: - -``` -mysql> CREATE TABLE iceberg.nyc.taxis2 AS SELECT * FROM iceberg.nyc.taxis; -Query OK, 6 rows affected (0.25 sec) -{'status':'COMMITTED', 'txnId':'10088'} -``` - -### 05 Data Query - -- Simple query - - ``` - mysql> SELECT * FROM iceberg.nyc.taxis; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.37 sec) - - mysql> SELECT * FROM iceberg.nyc.taxis2; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | - | 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | - | 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 4 rows in set (0.35 sec) - ``` - -- Partition pruning - - ``` - mysql> SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'; - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - | 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 
| - +-----------+---------+---------------+-------------+--------------------+----------------------------+ - 1 row in set (0.06 sec) - - mysql> EXPLAIN VERBOSE SELECT * FROM iceberg.nyc.taxis where vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'; - - .... - | 0:VICEBERG_SCAN_NODE(71) - | table: taxis - | predicates: (ts[#5] < '2024-01-02 00:00:00'), (vendor_id[#0] = 2), (ts[#5] >= '2024-01-01 00:00:00') - | inputSplitNum=1, totalFileSize=3539, scanRanges=1 - | partition=1/0 - | backends: - | 10002 - | s3://warehouse/wh/nyc/taxis/data/vendor_id=2/ts_day=2024-01-01/40e6ca404efa4a44-b888f23546d3a69c_5708e229-2f3d-4b68-a66b-44298a9d9815-0.zstd.parquet start: 0 length: 3539 - | cardinality=6, numNodes=1 - | pushdown agg=NONE - | icebergPredicatePushdown= - | ref(name="ts") < 1704153600000000 - | ref(name="vendor_id") == 2 - | ref(name="ts") >= 1704067200000000 - .... - ``` - - By examining the result of the `EXPLAIN VERBOSE` statement, it can be seen that the predicate condition `vendor_id = 2 and ts >= '2024-01-01' and ts < '2024-01-02'` ultimately only hits one partition (`partition=1/0`). - - It can also be observed that because a partition Transform function `DAY(ts)` was specified when creating the table, the original value in the data `2024-01-01 03:25:15.000000` will be transformed into the partition information in the file directory `ts_day=2024-01-01`. - -### 06 Time Travel - -Let's insert a few more rows of data: - -``` -INSERT INTO iceberg.nyc.taxis VALUES (1, 1000375, 8.8, 55.55, 'Y', '2024-01-01 8:10:22'), (3, 1000376, 7.4, 32.35, 'N', '2024-01-02 1:14:45'); -Query OK, 2 rows affected (0.17 sec) -{'status':'COMMITTED', 'txnId':'10086'} - -mysql> SELECT * FROM iceberg.nyc.taxis; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.11 sec) -``` - -Use the `iceberg_meta` table function to query the snapshot information of the table: - -``` -mysql> select * from iceberg_meta("table" = "iceberg.nyc.taxis", "query_type" = "snapshots"); -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| committed_at | snapshot_id | parent_id | operation | manifest_list | summary | 
-+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| 2024-07-29 03:38:22 | 8483933166442433486 | -1 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-8483933166442433486-1-5f7b7736-8022-4ba1-9db2-51ae7553be4d.avro | {"added-data-files":"4","added-records":"4","added-files-size":"14156","changed-partition-count":"4","total-records":"4","total-files-size":"14156","total-data-files":"4","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -| 2024-07-29 03:40:23 | 4726331391239920914 | 8483933166442433486 | append | s3://warehouse/wh/nyc/taxis/metadata/snap-4726331391239920914-1-6aa3d142-6c9c-4553-9c04-08ad4d49a4ea.avro | {"added-data-files":"2","added-records":"2","added-files-size":"7078","changed-partition-count":"2","total-records":"6","total-files-size":"21234","total-data-files":"6","total-delete-files":"0","total-position-deletes":"0","total-equality-deletes":"0"} | -+---------------------+---------------------+---------------------+-----------+-----------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -2 rows in set (0.07 sec) -``` - -Query a specified snapshot using the `FOR VERSION AS OF` statement: - -``` -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 8483933166442433486; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR VERSION AS OF 4726331391239920914; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000375 | 8.8 | 55.55 | Y | 2024-01-01 08:10:22.000000 | -| 3 | 1000376 | 7.4 | 32.35 | N | 2024-01-02 01:14:45.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -6 rows in set (0.04 sec) -``` - -Query a specified snapshot using 
the `FOR TIME AS OF` statement: - -``` -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:38:23"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.04 sec) - -mysql> SELECT * FROM iceberg.nyc.taxis FOR TIME AS OF "2024-07-29 03:40:22"; -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| vendor_id | trip_id | trip_distance | fare_amount | store_and_fwd_flag | ts | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -| 2 | 1000373 | 0.9 | 9.01 | N | 2024-01-01 03:25:15.000000 | -| 1 | 1000374 | 8.4 | 42.13 | Y | 2024-01-03 07:12:33.000000 | -| 2 | 1000372 | 2.5 | 22.15 | N | 2024-01-02 12:10:11.000000 | -| 1 | 1000371 | 1.8 | 15.32 | N | 2024-01-01 09:15:23.000000 | -+-----------+---------+---------------+-------------+--------------------+----------------------------+ -4 rows in set (0.05 sec) -``` - -### 07 Interacting with PyIceberg - -> Please use Doris 2.1.8/3.0.4 or above. - -Load an iceberg table: - -```python -from pyiceberg.catalog import load_catalog - -catalog = load_catalog( - "iceberg", - **{ - "warehouse" = "warehouse", - "uri" = "http://rest:8181", - "s3.access-key-id" = "admin", - "s3.secret-access-key" = "password", - "s3.endpoint" = "http://minio:9000" - }, -) -table = catalog.load_table("nyc.taxis") -``` - -Read table as `Arrow Table`: - -```python -print(table.scan().to_arrow()) - -pyarrow.Table -vendor_id: int64 -trip_id: int64 -trip_distance: float -fare_amount: double -store_and_fwd_flag: large_string -ts: timestamp[us] ----- -vendor_id: [[1],[1],[2],[2]] -trip_id: [[1000371],[1000374],[1000373],[1000372]] -trip_distance: [[1.8],[8.4],[0.9],[2.5]] -fare_amount: [[15.32],[42.13],[9.01],[22.15]] -store_and_fwd_flag: [["N"],["Y"],["N"],["N"]] -ts: [[2024-01-01 09:15:23.000000],[2024-01-03 07:12:33.000000],[2024-01-01 03:25:15.000000],[2024-01-02 12:10:11.000000]] -``` - -Read table as `Pandas DataFrame`: - -```python -print(table.scan().to_pandas()) - -vendor_id trip_id trip_distance fare_amount store_and_fwd_flag ts -0 1 1000371 1.8 15.32 N 2024-01-01 09:15:23 -1 1 1000374 8.4 42.13 Y 2024-01-03 07:12:33 -2 2 1000373 0.9 9.01 N 2024-01-01 03:25:15 -3 2 1000372 2.5 22.15 N 2024-01-02 12:10:11 -``` - -Read table as `Polars DataFrame`: - -```python -import polars as pl - -print(pl.scan_iceberg(table).collect()) - -shape: (4, 6) -┌───────────┬─────────┬───────────────┬─────────────┬────────────────────┬─────────────────────┐ -│ vendor_id ┆ trip_id ┆ trip_distance ┆ fare_amount ┆ store_and_fwd_flag ┆ ts │ -│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ -│ i64 ┆ i64 ┆ f32 ┆ f64 ┆ str ┆ datetime[μs] │ -╞═══════════╪═════════╪═══════════════╪═════════════╪════════════════════╪═════════════════════╡ -│ 1 ┆ 1000371 ┆ 1.8 ┆ 15.32 ┆ N ┆ 2024-01-01 09:15:23 │ -│ 1 ┆ 1000374 ┆ 8.4 ┆ 42.13 ┆ Y ┆ 2024-01-03 07:12:33 │ -│ 2 ┆ 1000373 ┆ 0.9 ┆ 9.01 ┆ N ┆ 2024-01-01 
03:25:15 │ -│ 2 ┆ 1000372 ┆ 2.5 ┆ 22.15 ┆ N ┆ 2024-01-02 12:10:11 │ -└───────────┴─────────┴───────────────┴─────────────┴────────────────────┴─────────────────────┘ -``` - -> Write iceberg table by PyIceberg, please see [step](#write-iceberg-table-by-pyiceberg) - -### 08 Appendix - -#### Write iceberg table by PyIceberg - -Load an iceberg table: - -```python -from pyiceberg.catalog import load_catalog - -catalog = load_catalog( - "iceberg", - **{ - "warehouse" = "warehouse", - "uri" = "http://rest:8181", - "s3.access-key-id" = "admin", - "s3.secret-access-key" = "password", - "s3.endpoint" = "http://minio:9000" - }, -) -table = catalog.load_table("nyc.taxis") -``` - -Write table with `Arrow Table` : - -```python -import pyarrow as pa - -df = pa.Table.from_pydict( - { - "vendor_id": pa.array([1, 2, 2, 1], pa.int64()), - "trip_id": pa.array([1000371, 1000372, 1000373, 1000374], pa.int64()), - "trip_distance": pa.array([1.8, 2.5, 0.9, 8.4], pa.float32()), - "fare_amount": pa.array([15.32, 22.15, 9.01, 42.13], pa.float64()), - "store_and_fwd_flag": pa.array(["N", "N", "N", "Y"], pa.string()), - "ts": pa.compute.strptime( - ["2024-01-01 9:15:23", "2024-01-02 12:10:11", "2024-01-01 3:25:15", "2024-01-03 7:12:33"], - "%Y-%m-%d %H:%M:%S", - "us", - ), - } -) -table.append(df) -``` - -Write table with `Pandas DataFrame` : - -```python -import pyarrow as pa -import pandas as pd - -df = pd.DataFrame( - { - "vendor_id": pd.Series([1, 2, 2, 1]).astype("int64[pyarrow]"), - "trip_id": pd.Series([1000371, 1000372, 1000373, 1000374]).astype("int64[pyarrow]"), - "trip_distance": pd.Series([1.8, 2.5, 0.9, 8.4]).astype("float32[pyarrow]"), - "fare_amount": pd.Series([15.32, 22.15, 9.01, 42.13]).astype("float64[pyarrow]"), - "store_and_fwd_flag": pd.Series(["N", "N", "N", "Y"]).astype("string[pyarrow]"), - "ts": pd.Series(["2024-01-01 9:15:23", "2024-01-02 12:10:11", "2024-01-01 3:25:15", "2024-01-03 7:12:33"]).astype("timestamp[us][pyarrow]"), - } -) -table.append(pa.Table.from_pandas(df)) -``` - -Write table with `Polars DataFrame` : - -```python -import polars as pl - -df = pl.DataFrame( - { - "vendor_id": [1, 2, 2, 1], - "trip_id": [1000371, 1000372, 1000373, 1000374], - "trip_distance": [1.8, 2.5, 0.9, 8.4], - "fare_amount": [15.32, 22.15, 9.01, 42.13], - "store_and_fwd_flag": ["N", "N", "N", "Y"], - "ts": ["2024-01-01 9:15:23", "2024-01-02 12:10:11", "2024-01-01 3:25:15", "2024-01-03 7:12:33"], - }, - { - "vendor_id": pl.Int64, - "trip_id": pl.Int64, - "trip_distance": pl.Float32, - "fare_amount": pl.Float64, - "store_and_fwd_flag": pl.String, - "ts": pl.String, - }, -).with_columns(pl.col("ts").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")) -table.append(df.to_arrow()) -``` diff --git a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md b/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md deleted file mode 100644 index deb4e5f472423..0000000000000 --- a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-lakesoul.md +++ /dev/null @@ -1,341 +0,0 @@ ---- -{ - "title": "Using Doris and LakeSoul", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of data warehouses with the low cost and flexibility of data lakes, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. 
- -In recent versions, Apache Doris has deepened its integration with data lakes and has evolved a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, enabling automatic metadata mapping and data access for various data sources, along with numerous performance optimizations for external data reading and query execution. It now fully possesses the ability to build a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris' Data Lakehouse architecture was significantly enhanced, improving the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100-fold increase in data transfer efficiency. - -![Building lakehouse using doris and lakesoul](/images/lakehouse-architecture-for-doris-and-lakesoul.png) - -# Apache Doris & LakeSoul - -LakeSoul is a cloud-native lakehouse framework developed by DMetaSoul and donated to the Linux Foundation AI & Data Foundation in May 2023. It features high scalability in metadata management, ACID transactions, efficient and flexible upsert operations, schema evolution, and batch-stream integrated processing . - -With Apache Doris' high-performance query engine and LakeSoul's efficient data management, users can achieve: - -- Real-time data ingestion into the lake: Leveraging LakeSoul's architecture, data can be ingested into the lake with high efficiency and low latency, supporting various data update capabilities including aggregation, deduplication, and partial column updates. -- High-performance data processing and analysis: LakeSoul's capabilities such as batch-stream integrated processing and schema evolution can be seamlessly integrated with Doris' powerful query engine, enabling fast querying and analysis responses for lake data. - -In the future, Apache Doris will gradually support more advanced features of LakeSoul, such as CDC stream synchronization and automatic schema evolution, to jointly build a unified, high-performance, real-time lakehouse platform. - -This article will explain how to quickly set up an Apache Doris + LakeSoul testing & demonstration environment and demonstrate the usage of various features, showcasing the integration and benefits of using both systems in a lakehouse architecture . - -For more information, please refer to [LakeSoul Catalog](../../../lakehouse/datalake-analytics/lakesoul) - - -## User Guide - -All scripts and code mentioned in this article can be obtained from this address: [https://github.com/apache/doris/tree/master/samples/datalake/lakesoul](https://github.com/apache/doris/tree/master/samples/datalake/lakesoul) - -### 01 Environment Preparation - -This article uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 3.0.2| -| LakeSoul | 2.6.1 | -| Postgres | 14.5 | -| Apache Spark | 3.3.1 | -| Apache Flink | 1.17 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - - -### 02 Environment Deployment - -1. Start all components - - ``` - bash ./start_all.sh - ``` - -2. 
After starting, you can use the following script to log in to the Doris command line: - - ``` - -- login doris - bash ./start_doris_client.sh - ``` - - -### 03 Data Query - -As shown below, a Catalog named `lakesoul` has been created in the Doris cluster (can be viewed using `SHOW CATALOGS`). The following is the creation statement for this Catalog: - -```sql - -- Already created - CREATE CATALOG `lakesoul` PROPERTIES ( - 'type'='lakesoul', - 'lakesoul.pg.username'='lakesoul_test', - 'lakesoul.pg.password'='lakesoul_test', - 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', - 'minio.endpoint'='http://minio:9000', - 'minio.access_key'='admin', - 'minio.secret_key'='password' - ); - - ``` - The LakeSoul Table `lakesoul.tpch.customer` has already been loaded into Doris. Query the data in Doris. -- Simple query - ```sql - Doris> use `lakesoul`.`tpch`; - Database changed - - Doris> show tables; - +---------------------+ - | Tables_in_tpch | - +---------------------+ - | customer_from_spark | - +---------------------+ - 1 row in set (0.00 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - | 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | - | 14 | Customer#000000014 | KXkletMlL2JQEA | 1 | 11-845-129-3851 | 5266.30 | FURNITURE | , ironic packages across the unus | - | 30 | Customer#000000030 | nJDsELGAavU63Jl0c5NKsKfL8rIJQQkQnYL2QJY | 1 | 11-764-165-5076 | 9321.01 | BUILDING | lithely final requests. furiously unusual account | - | 59 | Customer#000000059 | zLOCP0wh92OtBihgspOGl4 | 1 | 11-355-584-3112 | 3458.60 | MACHINERY | ously final packages haggle blithely after the express deposits. furiou | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ - 4 rows in set (3.14 sec) - - Doris> select * from customer_from_spark where c_nationkey = 1 order by c_custkey desc limit 4; - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - | 14983 | Customer#000014983 | ERN3vq5Fvt4DL | 1 | 11-424-279-1846 | 841.22 | AUTOMOBILE | furiously slyly special foxes. express theodolites cajole carefully. 
special dinos haggle pinto | - | 14968 | Customer#000014968 | ,sykKTZBzVFl7ito1750v2TRYwmkRl2nvqGHwmx | 1 | 11-669-222-9657 | 6106.77 | HOUSEHOLD | ts above the furiously even deposits haggle across | - | 14961 | Customer#000014961 | JEIORcsBp6RpLYH 9gNdDyWJ | 1 | 11-490-251-5554 | 4006.35 | HOUSEHOLD | quests detect carefully final platelets! quickly final frays haggle slyly blithely final acc | - | 14940 | Customer#000014940 | bNoyCxPuqSwPLjbqjEUNGN d0mSP | 1 | 11-242-677-1085 | 8829.48 | HOUSEHOLD | ver the quickly express braids. regular dependencies haggle fluffily quickly i | - +-----------+--------------------+-----------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------+ - 4 rows in set (0.10 sec) - ``` - -- Partition pruning -Doris can perform partition pruning on LakeSoul and speed up the query process through native reading. We can check this through `explain verbose`. - - ```sql - Doris> explain verbose select * from customer_from_spark where c_nationkey < 3; - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | Explain String(Old Planner) | - +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ - | PLAN FRAGMENT 0 | - | OUTPUT EXPRS: | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_custkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_name` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_address` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_nationkey` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_phone` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_acctbal` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_mktsegment` | - | `lakesoul`.`tpch`.`customer_from_spark`.`c_comment` | - | PARTITION: UNPARTITIONED | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | VRESULT SINK | - | MYSQL_PROTOCAL | - | | - | 1:VEXCHANGE | - | offset: 0 | - | tuple ids: 0 | - | | - | PLAN FRAGMENT 1 | - | | - | PARTITION: RANDOM | - | | - | HAS_COLO_PLAN_NODE: false | - | | - | STREAM DATA SINK | - | EXCHANGE ID: 01 | - | UNPARTITIONED | - | | - | 0:VplanNodeName | - | table: customer_from_spark | - | predicates: (`c_nationkey` < 3) | - | inputSplitNum=12, totalFileSize=0, scanRanges=12 | - | partition=0/0 | - | backends: | - | 10002 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00000-0568c817-d6bc-4fa1-bb9e-b311069b131c_00000.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00001-d99a8fe6-61ab-4285-94da-2f84f8746a8a_00001.c000.parquet start: 0 length: 0 | - | s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=1/part-00002-8a8e396f-685f-4b0f-87fa-e2a3fe5be87e_00002.c000.parquet start: 0 length: 0 | - | ... other 8 files ... 
|
-    |       s3://lakesoul-test-bucket/data/tpch/customer_from_spark/c_nationkey=0/part-00003-d5b598cd-5bed-412c-a26f-bb4bc9c937bc_00003.c000.parquet start: 0 length: 0 |
-    |      numNodes=1 |
-    |      pushdown agg=NONE |
-    |      tuple ids: 0 |
-    | |
-    | Tuples: |
-    | TupleDescriptor{id=0, tbl=customer_from_spark} |
-    |   SlotDescriptor{id=0, col=c_custkey, colUniqueId=0, type=int, nullable=false, isAutoIncrement=false, subColPath=null} |
-    |   SlotDescriptor{id=1, col=c_name, colUniqueId=1, type=text, nullable=true, isAutoIncrement=false, subColPath=null} |
-    |   SlotDescriptor{id=2, col=c_address, colUniqueId=2, type=text, nullable=true, isAutoIncrement=false, subColPath=null} |
-    |   SlotDescriptor{id=3, col=c_nationkey, colUniqueId=3, type=int, nullable=false, isAutoIncrement=false, subColPath=null} |
-    |   SlotDescriptor{id=4, col=c_phone, colUniqueId=4, type=text, nullable=true, isAutoIncrement=false, subColPath=null} |
-    |   SlotDescriptor{id=5, col=c_acctbal, colUniqueId=5, type=decimalv3(15,2), nullable=true, isAutoIncrement=false, subColPath=null} |
-    |   SlotDescriptor{id=6, col=c_mktsegment, colUniqueId=6, type=text, nullable=true, isAutoIncrement=false, subColPath=null} |
-    |   SlotDescriptor{id=7, col=c_comment, colUniqueId=7, type=text, nullable=true, isAutoIncrement=false, subColPath=null} |
-    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-    57 rows in set (0.03 sec)
-
-    ```
-
-  By examining the result of the `EXPLAIN VERBOSE` statement, it can be seen that the predicate condition `c_nationkey < 3` ultimately only hits one partition (`partition=0/0`).
-
-### 04 CDC Table Support
-
-Launch the Flink CDC job to sync the MySQL table. The MySQL table is loaded when the Docker Compose environment starts.
-
-```
-bash start_flink_cdc_job.sh
-```
-
-```sql
-Start flink-cdc job...
-SLF4J: Class path contains multiple SLF4J bindings.
-SLF4J: Found binding in [jar:file:/opt/flink/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]
-SLF4J: Found binding in [jar:file:/opt/hadoop/share/hadoop/common/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
-SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
-SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
-Loading class `com.mysql.jdbc.Driver'. This is deprecated. The new driver class is `com.mysql.cj.jdbc.Driver'. The driver is automatically registered via the SPI and manual loading of the driver class is generally unnecessary.
-Job has been submitted with JobID d1b3641dcd1ad85c6b373d49b1867e68
-
-```
-
-The Flink CDC job is now running. You can pick up the newly synced table in the `doris client` by recreating the `lakesoul` catalog. Once the job has been launched, the synced LakeSoul CDC table is visible from the `doris client`.
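-
-If you prefer not to drop and recreate the catalog, refreshing it should also make newly synced tables visible. This is only a sketch of an alternative, not part of the original walkthrough, and the exact metadata refresh behavior may vary across Doris versions:
-
-```sql
--- Hypothetical alternative to DROP/CREATE CATALOG: re-sync the external catalog
--- metadata so that the new LakeSoul CDC table shows up in Doris.
-REFRESH CATALOG lakesoul;
-```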
- -```sql -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - - -Doris> drop catalog if exists lakesoul; -Query OK, 0 rows affected (0.00 sec) - -Doris> create catalog `lakesoul` properties ('type'='lakesoul', 'lakesoul.pg.username'='lakesoul_test', 'lakesoul.pg.password'='lakesoul_test', 'lakesoul.pg.url'='jdbc:postgresql://lakesoul-meta-pg:5432/lakesoul_test?stringtype=unspecified', 'minio.endpoint'='http://minio:9000', 'minio.access_key'='admin', 'minio.secret_key'='password'); -Query OK, 0 rows affected (0.01 sec) - -Doris> show tables; -+---------------------+ -| Tables_in_tpch | -+---------------------+ -| customer | -| customer_from_spark | -+---------------------+ -2 rows in set (0.00 sec) - -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. 
furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (1.09 sec) - -``` - -Enter the `mysql client` and try to modify data. - -``` -bash start_mysql_client.sh -``` - -Try update row from `mysql client`. - -```sql -mysql> update customer set c_acctbal=2211.26 where c_custkey=1; -Query OK, 1 row affected (0.01 sec) -Rows matched: 1 Changed: 1 Warnings: 0 -``` - -Back to `doris client` and check the data changing. - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. foxes cajole accor | -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. 
carefully even theodolites haggle slyly along the ide | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -9 rows in set (0.11 sec) - -``` - -Try delete row from `mysql client`. - -```sql -mysql> delete from customer where c_custkey = 2; -Query OK, 1 row affected (0.01 sec) -``` - -Back to `doris client` and check the data changing. - -```sql -Doris> select c_custkey, c_name, c_address, c_nationkey , c_phone, c_acctbal , c_mktsegment , c_comment from lakesoul.tpch.customer where c_custkey < 10; -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -| 6 | Customer#000000006 | sKZz0CsnMD7mp4Xd0YrBvx,LREYKUWAh yVn | 20 | 30-114-968-4951 | 7638.57 | AUTOMOBILE | tions. even deposits boost according to the slyly bold packages. final accounts cajole requests. furious | -| 9 | Customer#000000009 | xKiAFTjUsCuxfeleNqefumTrjS | 8 | 18-338-906-3675 | 8324.07 | FURNITURE | r theodolites according to the requests wake thinly excuses: pending requests haggle furiousl | -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 2211.26 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | -| 7 | Customer#000000007 | TcGe5gaZNgVePxU5kRrvXBfkasDTea | 18 | 28-190-982-9759 | 9561.95 | AUTOMOBILE | ainst the ironic, express theodolites. express, even pinto beans among the exp | -| 8 | Customer#000000008 | I0B10bB0AymmC, 0PrRYBCP1yGJ8xcBPmWhl5 | 17 | 27-147-574-9335 | 6819.74 | BUILDING | among the slyly regular theodolites kindle blithely courts. carefully even theodolites haggle slyly along the ide | -| 4 | Customer#000000004 | XxVSJsLAGtn | 4 | 14-128-190-5944 | 2866.83 | MACHINERY | requests. final, regular ideas sleep final accou | -| 5 | Customer#000000005 | KvpyuHCplrB84WgAiGV6sYpZq7Tj | 3 | 13-750-942-6364 | 794.47 | HOUSEHOLD | n accounts will have to unwind. 
foxes cajole accor | -+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------------------+ -8 rows in set (0.11 sec) - -``` diff --git a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md b/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md deleted file mode 100644 index 26aba30e9623e..0000000000000 --- a/versioned_docs/version-2.0/gettingStarted/tutorials/building-lakehouse/doris-paimon.md +++ /dev/null @@ -1,270 +0,0 @@ ---- -{ - "title": "Using Doris and Paimon", - "language": "en" -} - ---- - - - -As a new open data management architecture, the Data Lakehouse integrates the high performance and real-time capabilities of data warehouses with the low cost and flexibility of data lakes, helping users more conveniently meet various data processing and analysis needs. It has been increasingly applied in enterprise big data systems. - -In recent versions, Apache Doris has deepened its integration with data lakes and has evolved a mature Data Lakehouse solution. - -- Since version 0.15, Apache Doris has introduced Hive and Iceberg external tables, exploring the capabilities of combining with Apache Iceberg for data lakes. -- Starting from version 1.2, Apache Doris officially introduced the Multi-Catalog feature, achieving automatic metadata mapping and data access for various data sources, along with many performance optimizations for external data reading and query execution. It now fully possesses the ability to build a high-speed and user-friendly Lakehouse architecture. -- In version 2.1, Apache Doris' Data Lakehouse architecture was significantly enhanced, strengthening the reading and writing capabilities of mainstream data lake formats (Hudi, Iceberg, Paimon, etc.), introducing compatibility with multiple SQL dialects, and seamless migration from existing systems to Apache Doris. For data science and large-scale data reading scenarios, Doris integrated the Arrow Flight high-speed reading interface, achieving a 100-fold improvement in data transfer efficiency. - -![Building lakehouse using Doris and Paimon](/images/lakehouse-architecture-for-doris-and-paimon.png) - -## Apache Doris & Paimon - -Apache Paimon is a data lake format that innovatively combines the advantages of data lake formats and LSM structures, successfully introducing efficient real-time streaming update capabilities into data lake architecture. This enables Paimon to efficiently manage data and perform real-time analysis, providing strong support for building real-time Data Lakehouse architecture. - -To fully leverage Paimon's capabilities and improve query efficiency for Paimon data, Apache Doris provides native support for several of Paimon's latest features: - -- Supports various types of Paimon Catalogs such as Hive Metastore and FileSystem. -- Native support for Paimon 0.6's Primary Key Table Read Optimized feature. -- Native support for Paimon 0.8's Primary Key Table Deletion Vector feature. - -With Apache Doris' high-performance query engine and Apache Paimon's efficient real-time streaming update capabilities, users can achieve: - -- Real-time data ingestion into the lake: Leveraging Paimon's LSM-Tree model, data ingestion into the lake can be reduced to a minute-level timeliness. 
Additionally, Paimon supports various data update capabilities including aggregation, deduplication, and partial column updates, making data flow more flexible and efficient. -- High-performance data processing and analysis: Paimon's technologies such as Append Only Table, Read Optimized, and Deletion Vector can be seamlessly integrated with Doris' powerful query engine, enabling fast querying and analysis responses for lake data. - -In the future, Apache Doris will gradually support more advanced features of Apache Paimon, including Time Travel and incremental data reading, to jointly build a unified, high-performance, real-time lakehouse platform. - -This article will explain how to quickly set up an Apache Doris + Apache Paimon testing & demonstration environment in a Docker environment and demonstrate the usage of various features. - -For more information, please refer to [Paimon Catalog](../../../lakehouse/datalake-analytics/paimon.md) - -## User Guide - -All scripts and code mentioned in this article can be obtained from the following address: [https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon](https://github.com/apache/doris/tree/master/samples/datalake/iceberg_and_paimon) - -### 01 Environment Preparation - -This article uses Docker Compose for deployment, with the following components and versions: - -| Component | Version | -| --- | --- | -| Apache Doris | Default 2.1.5, can be modified | -| Apache Paimon | 0.8 | -| Apache Flink | 1.18 | -| MinIO | RELEASE.2024-04-29T09-56-05Z | - -### 02 Environment Deployment - -1. Start all components - - `bash ./start_all.sh` - -2. After starting, you can use the following scripts to log in to the Flink command line or Doris command line: - - ``` - -- login flink - bash ./start_flink_client.sh - - -- login doris - bash ./start_doris_client.sh - ``` - -### 03 Data Preparation - -After logging into the Flink command line, you can see a pre-built table. The table already contains some data that can be viewed using Flink SQL. - -``` -Flink SQL> use paimon.db_paimon; -[INFO] Execute statement succeed. 
- -Flink SQL> show tables; -+------------+ -| table name | -+------------+ -| customer | -+------------+ -1 row in set - -Flink SQL> show create table customer; -+------------------------------------------------------------------------+ -| result | -+------------------------------------------------------------------------+ -| CREATE TABLE `paimon`.`db_paimon`.`customer` ( - `c_custkey` INT NOT NULL, - `c_name` VARCHAR(25), - `c_address` VARCHAR(40), - `c_nationkey` INT NOT NULL, - `c_phone` CHAR(15), - `c_acctbal` DECIMAL(12, 2), - `c_mktsegment` CHAR(10), - `c_comment` VARCHAR(117), - CONSTRAINT `PK_c_custkey_c_nationkey` PRIMARY KEY (`c_custkey`, `c_nationkey`) NOT ENFORCED -) PARTITIONED BY (`c_nationkey`) -WITH ( - 'bucket' = '1', - 'path' = 's3://warehouse/wh/db_paimon.db/customer', - 'deletion-vectors.enabled' = 'true' -) - | -+-------------------------------------------------------------------------+ -1 row in set - -Flink SQL> desc customer; -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| name | type | null | key | extras | watermark | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -| c_custkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_name | VARCHAR(25) | TRUE | | | | -| c_address | VARCHAR(40) | TRUE | | | | -| c_nationkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | -| c_phone | CHAR(15) | TRUE | | | | -| c_acctbal | DECIMAL(12, 2) | TRUE | | | | -| c_mktsegment | CHAR(10) | TRUE | | | | -| c_comment | VARCHAR(117) | TRUE | | | | -+--------------+----------------+-------+-----------------------------+--------+-----------+ -8 rows in set - -Flink SQL> select * from customer order by c_custkey limit 4; -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platel... | -| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic... | -| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic,... | -| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tl... | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious ... | -+-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ -4 rows in set -``` - -### 04 Data Query - -As shown below, a Catalog named `paimon` has been created in the Doris cluster (can be viewed using SHOW CATALOGS). 
The following is the statement for creating this Catalog:
-
-```sql
--- Already created, no need to execute it again
-CREATE CATALOG `paimon` PROPERTIES (
-    "type" = "paimon",
-    "warehouse" = "s3://warehouse/wh/",
-    "s3.endpoint"="http://minio:9000",
-    "s3.access_key"="admin",
-    "s3.secret_key"="password",
-    "s3.region"="us-east-1"
-);
-```
-
-You can query Paimon's data in Doris:
-
-```
-mysql> use paimon.db_paimon;
-Reading table information for completion of table and column names
-You can turn off this feature to get a quicker startup with -A
-
-Database changed
-mysql> show tables;
-+---------------------+
-| Tables_in_db_paimon |
-+---------------------+
-| customer            |
-+---------------------+
-1 row in set (0.00 sec)
-
-mysql> select * from customer order by c_custkey limit 4;
-+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+
-| c_custkey | c_name             | c_address                             | c_nationkey | c_phone         | c_acctbal | c_mktsegment | c_comment                                                                                                |
-+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+
-|         1 | Customer#000000001 | IVhzIApeRb ot,c,E                     |          15 | 25-989-741-2988 |    711.56 | BUILDING     | to the even, regular platelets. regular, ironic epitaphs nag e                                           |
-|         2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak        |          13 | 23-768-687-3665 |    121.65 | AUTOMOBILE   | l accounts. blithely ironic theodolites integrate boldly: caref                                          |
-|         3 | Customer#000000003 | MG9kdTD2WBHm                          |           1 | 11-719-748-3364 |   7498.12 | AUTOMOBILE   | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov    |
-|        32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J |          15 | 25-430-914-2194 |   3471.53 | BUILDING     | cial ideas. final, furious requests across the e                                                         |
-+-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+
-4 rows in set (1.89 sec)
-```
-
-### 05 Read Incremental Data
-
-You can update the data in the Paimon table using Flink SQL:
-
-```
-Flink SQL> update customer set c_address='c_address_update' where c_nationkey = 1;
-[INFO] Submitting SQL update statement to the cluster...
-[INFO] SQL update statement has been successfully submitted to the cluster:
-Job ID: ff838b7b778a94396b332b0d93c8f7ac
-```
-
-After the Flink SQL execution is complete, you can directly view the latest data in Doris:
-
-```
-mysql> select * from customer where c_nationkey=1 limit 2;
-+-----------+--------------------+------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+
-| c_custkey | c_name             | c_address        | c_nationkey | c_phone         | c_acctbal | c_mktsegment | c_comment                                                                                                |
-+-----------+--------------------+------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+
-|         3 | Customer#000000003 | c_address_update |           1 | 11-719-748-3364 |   7498.12 | AUTOMOBILE   | deposits eat slyly ironic, even instructions. express foxes detect slyly. 
blithely even accounts abov | -| 513 | Customer#000000513 | c_address_update | 1 | 11-861-303-6887 | 955.37 | HOUSEHOLD | press along the quickly regular instructions. regular requests against the carefully ironic s | -+-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ -2 rows in set (0.19 sec) -``` - -### Benchmark - -We conducted a simple test on the TPCDS 1000 dataset in Paimon (0.8) version, using Apache Doris 2.1.5 version and Trino 422 version, both with the Primary Key Table Read Optimized feature enabled. - -![](/images/quick-start/lakehouse-paimon-benchmark.PNG) - -From the test results, it can be seen that Doris' average query performance on the standard static test set is 3-5 times that of Trino. In the future, we will optimize the Deletion Vector to further improve query efficiency in real business scenarios. - -## Query Optimization - -For baseline data, after introducing the Primary Key Table Read Optimized feature in Apache Paimon version 0.6, the query engine can directly access the underlying Parquet/ORC files, significantly improving the reading efficiency of baseline data. For unmerged incremental data (data increments generated by INSERT, UPDATE, or DELETE), they can be read through Merge-on-Read. In addition, Paimon introduced the Deletion Vector feature in version 0.8, which further enhances the query engine's efficiency in reading incremental data. -Apache Doris supports reading Deletion Vector through native Reader and performing Merge on Read. We demonstrate the query methods for baseline data and incremental data in a query using Doris' EXPLAIN statement. - -``` -mysql> explain verbose select * from customer where c_nationkey < 3; -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| Explain String(Nereids Planner) | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -| ............... | -| | -| 0:VPAIMON_SCAN_NODE(68) | -| table: customer | -| predicates: (c_nationkey[#3] < 3) | -| inputSplitNum=4, totalFileSize=238324, scanRanges=4 | -| partition=3/0 | -| backends: | -| 10002 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-15cee5b7-1bd7-42ca-9314-56d92c62c03b-0.orc start: 0 length: 66600 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-5d50255a-2215-4010-b976-d5dc656f3444-0.orc start: 0 length: 44501 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=2/bucket-0/data-e98fb7ef-ec2b-4ad5-a496-713cb9481d56-0.orc start: 0 length: 64059 | -| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=0/bucket-0/data-431be05d-50fa-401f-9680-d646757d0f95-0.orc start: 0 length: 63164 | -| cardinality=18751, numNodes=1 | -| pushdown agg=NONE | -| paimonNativeReadSplits=4/4 | -| PaimonSplitStats: | -| SplitStat [type=NATIVE, rowCount=1542, rawFileConvertable=true, hasDeletionVector=true] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | -| tuple ids: 0 -| ............... 
| | -+------------------------------------------------------------------------------------------------------------------------------------------------+ -67 rows in set (0.23 sec) -``` - -It can be seen that the table just updated by Flink SQL contains 4 shards, and all shards can be accessed through Native Reader (paimonNativeReadSplits=4/4). In addition, the hasDeletionVector property of the first shard is true, indicating that the shard has a corresponding Deletion Vector, and data will be filtered according to the Deletion Vector when reading. - diff --git a/versioned_docs/version-2.0/table-design/data-model/duplicate.md b/versioned_docs/version-2.0/table-design/data-model/duplicate.md index eef7c116fb39d..d39ed3252adbc 100644 --- a/versioned_docs/version-2.0/table-design/data-model/duplicate.md +++ b/versioned_docs/version-2.0/table-design/data-model/duplicate.md @@ -38,7 +38,7 @@ For example, a table has the following data columns and requires the retention o | op_id | BIGINT | No | Operator ID | | op_time | DATETIME | No | Operation time | -## **Duplicate Model with Sort Columns ** +## Duplicate Model with Sort Columns In the table creation statement, the `Duplicate Key` can be designated to indicate that data storage should be sorted according to these key columns. When choosing the `Duplicate Key`, it is recommended to select the first 2-4 columns. @@ -73,7 +73,7 @@ MySQL> desc example_tbl_duplicate; +------------+---------------+------+-------+---------+-------+ ``` -## **Default Duplicate Model** +## Default Duplicate Model When no data model (Unique, Aggregate, or Duplicate) is specified during table creation, a Duplicate model table is created by default, and the sort columns are automatically selected according to certain rules. For example, in the following table creation statement, if no data model is specified, a Duplicate model table will be established, and the system will automatically select the first three columns as the sort columns. 
diff --git a/versioned_docs/version-2.0/table-design/schema-change.md b/versioned_docs/version-2.0/table-design/schema-change.md index babda7a7015b2..77c813675eb8c 100644 --- a/versioned_docs/version-2.0/table-design/schema-change.md +++ b/versioned_docs/version-2.0/table-design/schema-change.md @@ -1,6 +1,6 @@ --- { - "title": "Schema Evolution", + "title": "Schema Change", "language": "en" } --- diff --git a/versioned_docs/version-2.1/data-operate/export/export-manual.md b/versioned_docs/version-2.1/data-operate/export/export-manual.md index eb7e782eaeecc..6533bf292bf11 100644 --- a/versioned_docs/version-2.1/data-operate/export/export-manual.md +++ b/versioned_docs/version-2.1/data-operate/export/export-manual.md @@ -1,6 +1,6 @@ --- { - "title": "Export", + "title": "Using EXPORT Command", "language": "en" } --- diff --git a/versioned_docs/version-2.1/data-operate/export/export-with-mysql-dump.md b/versioned_docs/version-2.1/data-operate/export/export-with-mysql-dump.md index 121ce811a99a8..f1aacd79126da 100644 --- a/versioned_docs/version-2.1/data-operate/export/export-with-mysql-dump.md +++ b/versioned_docs/version-2.1/data-operate/export/export-with-mysql-dump.md @@ -1,6 +1,6 @@ --- { -"title": "MySQL Dump", +"title": "Using MySQL Dump", "language": "en" } --- diff --git a/versioned_docs/version-2.1/data-operate/export/outfile.md b/versioned_docs/version-2.1/data-operate/export/outfile.md index 9e08f558a3e1f..cb0eaa14beb2d 100644 --- a/versioned_docs/version-2.1/data-operate/export/outfile.md +++ b/versioned_docs/version-2.1/data-operate/export/outfile.md @@ -1,6 +1,6 @@ --- { - "title": "Select Info Outfile", + "title": "Using SELECT INTO OUTFILE Command", "language": "en" } --- diff --git a/versioned_docs/version-2.1/db-connect/arrow-flight-sql-connect.md b/versioned_docs/version-2.1/db-connect/arrow-flight-sql-connect.md index adfd0dc540a3e..54382516bb980 100644 --- a/versioned_docs/version-2.1/db-connect/arrow-flight-sql-connect.md +++ b/versioned_docs/version-2.1/db-connect/arrow-flight-sql-connect.md @@ -1,6 +1,6 @@ --- { - "title": "High-speed data transmission link based on Arrow Flight SQL", + "title": "Connecting by Arrow Flight SQL Protocol", "language": "en" } --- diff --git a/versioned_docs/version-2.1/db-connect/database-connect.md b/versioned_docs/version-2.1/db-connect/database-connect.md index f09c90eaf9a20..bd01e8a68d7d4 100644 --- a/versioned_docs/version-2.1/db-connect/database-connect.md +++ b/versioned_docs/version-2.1/db-connect/database-connect.md @@ -1,6 +1,6 @@ --- { - "title": "Connecting to Database", + "title": "Connecting by MySQL Protocol", "language": "en" } --- diff --git a/versioned_docs/version-2.1/gettingStarted/tutorials/log-storage-analysis.md b/versioned_docs/version-2.1/gettingStarted/tutorials/log-storage-analysis.md deleted file mode 100644 index 9d54040e66696..0000000000000 --- a/versioned_docs/version-2.1/gettingStarted/tutorials/log-storage-analysis.md +++ /dev/null @@ -1,591 +0,0 @@ ---- -{ - "title": "Building log analysis platform", - "language": "en" -} ---- - - - -Logs record key events in the system and contain crucial information such as the events' subject, time, location, and content. To meet the diverse needs of observability in operations, network security monitoring, and business analysis, enterprises might need to collect scattered logs for centralized storage, querying, and analysis to extract valuable content from the log data further. - -In this scenario, Apache Doris provides a corresponding solution. 
With the characteristics of log scenarios in mind, Apache Doris added inverted-index and ultra-fast full-text search capabilities, optimizing write performance and storage space to the extreme. This allows users to build an open, high-performance, cost-effective, and unified log storage and analysis platform based on Apache Doris.
-
-Focused on this solution, this chapter contains the following 3 sections:
-
-- **Overall architecture**: This section explains the core components and architecture of the log storage and analysis platform built on Apache Doris.
-
-- **Features and advantages**: This section explains the features and advantages of the log storage and analysis platform built on Apache Doris.
-
-- **Operational guide**: This section explains how to build a log storage and analysis platform based on Apache Doris.
-
-## Overall architecture
-
-The following figure illustrates the architecture of the log storage and analysis platform built on Apache Doris:
-
-![Overall architecture](/images/doris-overall-architecture.png)
-
-The architecture contains the following 3 parts:
-
-- **Log collection and preprocessing**: Various log collection tools can write log data into Apache Doris through HTTP APIs.
-
-- **Log storage and analysis engine**: Apache Doris provides high-performance, low-cost unified log storage, offering rich retrieval and analysis capabilities through SQL interfaces.
-
-- **Log analysis and alert interface**: Various log retrieval and analysis tools can query Apache Doris through standard SQL interfaces, providing users with a simple and user-friendly interface.
-
-## Features and advantages
-
-The log storage and analysis platform built on Apache Doris offers the following features and advantages:
-
-- **High throughput, low latency log writing**: Supports stable ingestion of hundreds of TB of log data per day at GB/s-level write speeds, while keeping latency within 1 second.
-
-- **Cost-effective storage of massive log data**: Supports petabyte-scale storage, saving 60% to 80% in storage costs compared to Elasticsearch, and further reducing storage costs by 50% by storing cold data in S3/HDFS.
-
-- **High-performance log full-text search and analysis**: Supports inverted indexes and full-text search, providing second-level response times for common log queries (keyword searches, trend analysis, etc.).
-
-- **Open and user-friendly upstream and downstream ecosystem**: Upstream integration with common log collection systems and data sources such as Logstash, Filebeat, Fluentbit, and Kafka through Stream Load's universal HTTP APIs, and downstream integration with various visual analytics UIs using standard MySQL protocol and syntax, such as observability platform Grafana, BI analytics Superset, and log retrieval Doris WebUI similar to Kibana.
-
-### Cost-effective performance
-
-After Benchmark testing and production validation, the log storage and analysis platform built on Apache Doris has a 5 to 10 times cost-performance advantage over Elasticsearch. Apache Doris's performance benefits are mainly due to its globally leading high-performance storage and query engine, as well as specialized optimizations for log scenarios:
-
-- **Improved write throughput**: Elasticsearch's write performance bottleneck lies in CPU consumption for parsing data and building inverted indexes. 
In comparison, Apache Doris has optimized writes in two aspects: using SIMD and other CPU vector instructions to improve JSON data parsing speed and index-building performance and simplifying the inverted index structure for log scenarios by removing unnecessary data structures like forward indexes, effectively reducing index build complexity. With the same resources, Apache Doris's write performance is 3 to 5 times higher than Elasticsearch. - -- **Reduced Storage Costs**: The storage bottleneck in Elasticsearch lies in the multiple storage formats for forward indexes, inverted indexes, and Docvalue columns, as well as the relatively low compression ratio of its general compression algorithms. In contrast, Apache Doris has made the following optimizations in storage: it removes the forward index, reducing the index data size by 30%; it uses columnar storage and the Zstandard compression algorithm, achieving a compression ratio of 5 to 10 times, which is significantly higher than Elasticsearch's 1.5 times; in log data, cold data is accessed very infrequently, and Apache Doris's hot and cold data tiering feature can automatically store logs that exceed a defined time period into lower-cost object storage, reducing the storage cost of cold data by more than 70%. For the same raw data, Doris's storage costs are only about 20% of those of Elasticsearch. - -### Strong analytical capabilities - -Apache Doris supports standard SQL and is compatible with MySQL protocol and syntax. Therefore, log systems built on Apache Doris can use SQL for log analysis, giving the following advantages to log systems: - -- **Easy to use**: Engineers and data analysts are very familiar with SQL, their expertise can be reused, no need to learn new technology stacks to quickly get started. - -- **Rich ecosystem**: The MySQL ecosystem is the most widely used language in the database field, seamlessly integrating with and applying to the MySQL ecosystem. Doris can leverage MySQL command line and various GUI tools, BI tools, and other big data ecosystem tools for more complex and diverse data processing and analysis needs. - -- **Strong analytical capabilities**: SQL has become the de facto standard for database and big data analysis, with powerful expressive capabilities and functions supporting retrieval, aggregation, multi-table JOIN, subqueries, UDFs, logical views, materialized views, and various data analysis capabilities. - -### Flexible Schema - -Here is a typical example of a semi-structured log in JSON format. The top-level fields are some fixed fields, such as timestamp, source, node, component, level, clientRequestID, message, and properties, which are present in every log entry. The nested fields of the properties , such as properties.size and properties.format are more dynamic, and the fields of each log may vary. 
- -```JSON -{ - "timestamp": "2014-03-08T00:50:03.8432810Z", - "source": "ADOPTIONCUSTOMERS81", - "node": "Engine000000000405", - "level": "Information", - "component": "DOWNLOADER", - "clientRequestId": "671db15d-abad-94f6-dd93-b3a2e6000672", - "message": "Downloading file path: benchmark/2014/ADOPTIONCUSTOMERS81_94_0.parquet.gz", - "properties": { - "size": 1495636750, - "format": "parquet", - "rowCount": 855138, - "downloadDuration": "00:01:58.3520561" - } -} -``` - - -Apache Doris provides several aspects of support for Flexible Schema log data: - -- For changes to top-level fields, Light Schema Change can be used to add or remove columns and to add or remove indexes, enabling schema changes to be completed in seconds. When planning a log platform, users only need to consider which fields need to be indexed. - -- For extension fields similar to properties, the native semi-structured data type `VARIANT` is provided, which can write any JSON data, automatically recognize field names and types in JSON, and automatically split frequently occurring fields for columnar storage for subsequent analysis. Additionally, `VARIANT` can create inverted indexes to accelerate internal field queries and retrievals. - -Compared to Elasticsearch's Dynamic Mapping, Apache Doris's Flexible Schema has the following advantages: - -- Allows a field to have multiple types, `VARIANT` automatically handles conflicts and type promotion for fields, better adapting to iterative changes in log data. - -- `VARIANT` automatically merges infrequently occurring fields into a column store to avoid performance issues caused by excessive fields, metadata, or columns. - -- Not only can columns be dynamically added, but they can also be dynamically deleted, and indexes can be dynamically added or removed, eliminating the need to index all fields at the beginning like Elasticsearch, reducing unnecessary costs. - -## Operational guide - -### Step 1: Estimate resources - -Before deploying the cluster, you need to estimate the hardware resources required for the servers. Follow the steps below: - -1. Estimate the resources for data writing by the following calculation formulas: - -- `Average write throughput = Daily data increment / 86400 s` - -- `Peak write throughput = Average write throughput \* Ratio of the peak write throughput to the average write throughput` - -- `Number of CPU cores for the peak write throughput = Peak write throughput / Write throughput of a single-core CPU` - -1. Estimate the resources for data storage by the calculation formula: `Storage space = Daily data increment / Data compression ratio * Number of data copies * Data storage duration`. - -2. Estimate the resources for data querying. The resources for data querying depend on the query volume and complexity. It is recommended to reserve 50% of CPU resources for data query initially and then adjust according to the actual test results. - -3. Integrate the calculation results as follows: - - 1. Divide the number of CPU cores calculated in Step 1 and Step 3 by the number of CPU cores of a BE server, and you can get the number of BE servers. - - 2. Based on the number of BE servers and the calculation result of Step 2, estimate the storage space required for each BE server. - - 3. Allocate the storage space required for each BE server to 4 to 12 data disks, and you can get the storage capacity required for a single data disk. 
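-
-To make the arithmetic concrete, the formulas above can be evaluated directly as a constant query. This is purely an illustrative sketch added here (it is not part of the original sizing procedure), and it plugs in the same inputs as the worked example that follows: 100 TB/day, a 200% peak ratio, 10 MB/s per CPU core, a compression ratio of 5, 1 data copy, and 3/30 days of hot/cold retention.
-
-```sql
--- Back-of-the-envelope sizing; any SQL engine (including Doris itself) can evaluate this.
-SELECT
-    100 * 1024 * 1024 / 86400          AS avg_write_mb_per_s,    -- ~1214 MB/s
-    100 * 1024 * 1024 / 86400 * 2      AS peak_write_mb_per_s,   -- ~2427 MB/s
-    100 * 1024 * 1024 / 86400 * 2 / 10 AS peak_write_cpu_cores,  -- ~242.7 cores
-    100 / 5 * 1 * 3                    AS hot_storage_tb,        -- 60 TB
-    100 / 5 * 1 * 30                   AS cold_storage_tb;       -- 600 TB
-```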
-
-For example, suppose that the daily data increment is 100 TB, the data compression ratio is 5, the number of data copies is 1, the storage duration of hot data is 3 days, the storage duration of cold data is 30 days, the ratio of the peak write throughput to the average write throughput is 200%, the write throughput of a single-core CPU is 10 MB/s, and 50% of CPU resources are reserved for data querying, one can estimate that:
-
-- 3 FE servers are required, each configured with a 16-core CPU, 64 GB memory, and one 100 GB SSD disk.
-
-- 15 BE servers are required, each configured with a 32-core CPU, 256 GB memory, and ten 600 GB SSD disks.
-
-- 600 TB of S3 object storage space is required.
-
-Refer to the following table to learn about the values of indicators in the example above and how they are calculated.
-
-| Indicator (Unit) | Value | Description |
-| --- | --- | --- |
-| Daily data increment (TB) | 100 | Specify the value according to your actual needs. |
-| Data compression ratio | 5 | Specify the value according to your actual needs, which is often between 3 to 10. Note that the data contains index data. |
-| Number of data copies | 1 | Specify the value according to your actual needs, which can be 1, 2, or 3. The default value is 1. |
-| Storage duration of hot data (day) | 3 | Specify the value according to your actual needs. |
-| Storage duration of cold data (day) | 30 | Specify the value according to your actual needs. |
-| Data storage duration (day) | 33 | Calculation formula: `Storage duration of hot data + Storage duration of cold data` |
-| Estimated storage space for hot data (TB) | 60 | Calculation formula: `Daily data increment / Data compression ratio * Number of data copies * Storage duration of hot data` |
-| Estimated storage space for cold data (TB) | 600 | Calculation formula: `Daily data increment / Data compression ratio * Number of data copies * Storage duration of cold data` |
-| Ratio of the peak write throughput to the average write throughput | 200% | Specify the value according to your actual needs. The default value is 200%. |
-| Number of CPU cores of a BE server | 32 | Specify the value according to your actual needs. The default value is 32. |
-| Average write throughput (MB/s) | 1214 | Calculation formula: `Daily data increment / 86400 s` |
-| Peak write throughput (MB/s) | 2427 | Calculation formula: `Average write throughput * Ratio of the peak write throughput to the average write throughput` |
-| Number of CPU cores for the peak write throughput | 242.7 | Calculation formula: `Peak write throughput / Write throughput of a single-core CPU` |
-| Percent of CPU resources reserved for data querying | 50% | Specify the value according to your actual needs. The default value is 50%. |
-| Estimated number of BE servers | 15.2 | Calculation formula: `Number of CPU cores for the peak write throughput / Number of CPU cores of a BE server / (1 - Percent of CPU resources reserved for data querying)` |
-| Rounded number of BE servers | 15 | Calculation formula: `MAX (Number of data copies, Estimated number of BE servers)` |
-| Estimated data storage space for each BE server (TB) | 5.7 | Calculation formula: `Estimated storage space for hot data / Estimated number of BE servers / (1 - 30%)`, where 30% represents the percent of reserved storage space.

It is recommended to mount 4 to 12 data disks on each BE server to enhance I/O capabilities. | - -### Step 2: Deploy the cluster - -After estimating the resources, you need to deploy the cluster. It is recommended to deploy in both physical and virtual environments manually. For manual deployment, refer to [Manual Deployment](../../install/cluster-deployment/standard-deployment.md). - -### Step 3: Optimize FE and BE configurations - -After completing the cluster deployment, it is necessary to optimize the configuration parameters for both the front-end and back-end separately, so as to better suit the scenario of log storage and analysis. - -**Optimize FE configurations** - -You can find FE configuration fields in `fe/conf/fe.conf`. Refer to the following table to optimize FE configurations. - -| Configuration fields to be optimized | Description | -| :----------------------------------------------------------- | :----------------------------------------------------------- | -| `max_running_txn_num_per_db = 10000` | Increase the parameter value to adapt to high-concurrency import transactions. | -| `streaming_label_keep_max_second = 3600` `label_keep_max_second = 7200` | Increase the retention time to handle high-frequency import transactions with high memory usage. | -| `enable_round_robin_create_tablet = true` | When creating Tablets, use a Round Robin strategy to distribute evenly. | -| `tablet_rebalancer_type = partition` | When balancing Tablets, use a strategy to evenly distribute within each partition. | -| `autobucket_min_buckets = 10` | Increase the minimum number of automatically bucketed buckets from 1 to 10 to avoid insufficient buckets when the log volume increases. | -| `max_backend_heartbeat_failure_tolerance_count = 10` | In log scenarios, the BE server may experience high pressure, leading to short-term timeouts, so increase the tolerance count from 1 to 10. | - -For more information, refer to [FE Configuration](../../admin-manual/config/fe-config.md). - -**Optimize BE configurations** - -You can find BE configuration fields in `be/conf/be.conf`. Refer to the following table to optimize BE configurations. - -| Module | Configuration fields to be optimized | Description | -| :--------- | :----------------------------------------------------------- | :----------------------------------------------------------- | -| Storage | `storage_root_path = /path/to/dir1;/path/to/dir2;...;/path/to/dir12` | Configure the storage path for hot data on disk directories. | -| - | `enable_file_cache = true` | Enable file caching. | -| - | `file_cache_path = [{"path": "/mnt/datadisk0/file_cache", "total_size":53687091200, "query_limit": "10737418240"},{"path": "/mnt/datadisk1/file_cache", "total_size":53687091200,"query_limit": "10737418240"}]` | Configure the cache path and related settings for cold data with the following specific configurations:
`path`: cache path
`total_size`: total size of the cache path in bytes, where 53687091200 bytes equals 50 GB
`query_limit`: maximum amount of data that can be queried from the cache path in one query in bytes, where 10737418240 bytes equals 10 GB | -| Write | `write_buffer_size = 1073741824` | Increase the file size of the write buffer to reduce small files and random I/O operations, improving performance. | -| - | `max_tablet_version_num = 20000` | In coordination with the time_series compaction strategy for table creation, allow more versions to remain temporarily unmerged | -| Compaction | `max_cumu_compaction_threads = 8` | Set to CPU core count / 4, indicating that 1/4 of CPU resources are used for writing, 1/4 for background compaction, and 2/1 for queries and other operations. | -| - | `inverted_index_compaction_enable = true` | Enable inverted index compaction to reduce CPU consumption during compaction. | -| - | `enable_segcompaction = false` `enable_ordered_data_compaction = false` | Disable two compaction features that are unnecessary for log scenarios. | -| - | `enable_compaction_priority_scheduling = false` | Low-priority compaction is limited to 2 tasks on a single disk, which can affect the speed of compaction. | -| - | `total_permits_for_compaction_score = 200000 ` | The parameter is used to control memory, under the memory time series strategy, the parameter itself can control memory. | -| Cache | `disable_storage_page_cache = true` `inverted_index_searcher_cache_limit = 30%` | Due to the large volume of log data and limited caching effect, switch from data caching to index caching. | -| - | `inverted_index_cache_stale_sweep_time_sec = 3600` `index_cache_entry_stay_time_after_lookup_s = 3600` | Maintain index caching in memory for up to 1 hour. | -| - | `enable_inverted_index_cache_on_cooldown = true`
`enable_write_index_searcher_cache = false` | Enable automatic caching of cold data storage during index uploading. | -| - | `tablet_schema_cache_recycle_interval = 3600` `segment_cache_capacity = 20000` | Reduce memory usage by other caches. | -| - | `inverted_index_ram_dir_enable = true` | Reduce the IO overhead caused by writing to index files temporarily. | -| Thread | `pipeline_executor_size = 24` `doris_scanner_thread_pool_thread_num = 48` | Configure computing threads and I/O threads for a 32-core CPU in proportion to core count. | -| - | `scan_thread_nice_value = 5` | Lower the priority of query I/O threads to ensure writing performance and timeliness. | -| Other | `string_type_length_soft_limit_bytes = 10485760` | Increase the length limit of string-type data to 10 MB. | -| - | `trash_file_expire_time_sec = 300` `path_gc_check_interval_second = 900` `path_scan_interval_second = 900` | Accelerate the recycling of trash files. | - - -For more information, refer to [BE Configuration](../../admin-manual/config/be-config.md). - -### Step 4: Create tables - -Due to the distinct characteristics of both writing and querying log data, it is recommended to configure tables with targeted settings to enhance performance. - -**Configure data partitioning and bucketing** - -- For data partitioning: - - - Enable [range partitioning](../../table-design/data-partitioning/dynamic-partitioning#range-partition) (`PARTITION BY RANGE(`ts`)`) with [dynamic partitions](../../table-design/data-partitioning/dynamic-partitioning) (`"dynamic_partition.enable" = "true"`) managed automatically by day. - - - Use a field in the DATETIME type as the key (`DUPLICATE KEY(ts)`) for accelerated retrieval of the latest N log entries. - -- For data bucketing: - - - Configure the number of buckets to be roughly three times the total number of disks in the cluster, with each bucket containing approximately 5GB of data after compression. - - - Use the Random strategy (`DISTRIBUTED BY RANDOM BUCKETS 60`) to optimize batch writing efficiency when paired with single tablet imports. - -For more information, refer to [Data Partitioning](../../table-design/data-partitioning/basic-concepts). - -**Configure compression parameters** - -Use the zstd compression algorithm ("compression" = "zstd") to improve data compression efficiency. - -**Configure compaction parameters** - -Configure compaction fields as follows: - -- Use the time_series strategy (`"compaction_policy" = "time_series"`) to reduce write amplification, which is crucial for high-throughput log writes. - -**Configure index parameters** - -Configuring index fields as follows: - -- Create indexes for fields that are frequently queried (`USING INVERTED`). - -- For fields that require full-text search, specify the parser field as unicode, which satisfies most requirements. If there is a need to support phrase queries, set the support_phrase field to true; if not needed, set it to false to reduce storage space. - -**Configure storage parameters** - -Configure storage policies as follows: - -- For storage of hot data, if using cloud storage, configure the number of data copies as 1; if using physical disks, configure the number of data copies as at least 2 (`"replication_num" = "2"`). - -- Configure the storage location for log_s3 (`CREATE RESOURCE "log_s3"`) and set the log_policy_3day policy (`CREATE STORAGE POLICY log_policy_3day`), where the data is cooled and moved to the specified storage location of log_s3 after 3 days. Refer to the code below. 
- -```SQL -CREATE DATABASE log_db; -USE log_db; - -CREATE RESOURCE "log_s3" -PROPERTIES -( - "type" = "s3", - "s3.endpoint" = "your_endpoint_url", - "s3.region" = "your_region", - "s3.bucket" = "your_bucket", - "s3.root.path" = "your_path", - "s3.access_key" = "your_ak", - "s3.secret_key" = "your_sk" -); - -CREATE STORAGE POLICY log_policy_3day -PROPERTIES( - "storage_resource" = "log_s3", - "cooldown_ttl" = "259200" -); - -CREATE TABLE log_table -( - `ts` DATETIME, - `host` TEXT, - `path` TEXT, - `message` TEXT, - INDEX idx_host (`host`) USING INVERTED, - INDEX idx_path (`path`) USING INVERTED, - INDEX idx_message (`message`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true") -) -ENGINE = OLAP -DUPLICATE KEY(`ts`) -PARTITION BY RANGE(`ts`) () -DISTRIBUTED BY RANDOM BUCKETS 60 -PROPERTIES ( - "compression" = "zstd", - "compaction_policy" = "time_series", - "dynamic_partition.enable" = "true", - "dynamic_partition.create_history_partition" = "true", - "dynamic_partition.time_unit" = "DAY", - "dynamic_partition.start" = "-30", - "dynamic_partition.end" = "1", - "dynamic_partition.prefix" = "p", - "dynamic_partition.buckets" = "60", - "dynamic_partition.replication_num" = "2", -- unnecessary for the compute-storage decoupled mode - "replication_num" = "2", -- unnecessary for the compute-storage decoupled mode - "storage_policy" = "log_policy_3day" -- unnecessary for the compute-storage decoupled mode -); -``` - -### Step 5: Collect logs - -After completing table creation, you can proceed with log collection. - -Apache Doris provides open and versatile Stream HTTP APIs, through which you can connect with popular log collectors such as Logstash, Filebeat, Kafka, and others to carry out log collection work. This section explains how to integrate these log collectors using the Stream HTTP APIs. - -**Integrating Logstash** - -Follow these steps: - -1. Download and install the Logstash Doris Output plugin. You can choose one of the following two methods: - - - [Click to download](https://apache-doris-releases.oss-accelerate.aliyuncs.com/logstash-output-doris-1.0.0.gem) and install. - - - Compile from the source code and run the following command to install: - -```shell -./bin/logstash-plugin install logstash-output-doris-1.0.0.gem -``` - -2. Configure Logstash. Specify the following fields: - -- `logstash.yml`: Used to configure Logstash batch processing log sizes and timings for improved data writing performance. - -```Plain Text -pipeline.batch.size: 1000000 -pipeline.batch.delay: 10000 -``` - -- `logstash_demo.conf`: Used to configure the specific input path of the collected logs and the settings for output to Apache Doris. - -``` -input { - file { - path => "/path/to/your/log" - } -} - -output { - doris { - http_hosts => [ "", "", "" ] - user => "your_username" - password => "your_password" - db => "your_db" - table => "your_table" - - # doris stream load http headers - headers => { - "format" => "json" - "read_json_by_line" => "true" - "load_to_single_tablet" => "true" - } - - # field mapping: doris field name => logstash field name - # %{} to get a logstash field, [] for nested field such as [host][name] for host.name - mapping => { - "ts" => "%{@timestamp}" - "host" => "%{[host][name]}" - "path" => "%{[log][file][path]}" - "message" => "%{message}" - } - log_request => true - log_speed_interval => 10 - } -} - ``` - -3. Run Logstash according to the command below, collect logs, and output to Apache Doris. 
- -```shell -./bin/logstash -f logstash_demo.conf -``` - -For more information about the Logstash Doris Output plugin, see [Logstash Doris Output Plugin](../../ecosystem/logstash.md). - -**Integrating Filebeat** - -Follow these steps: - -1. Obtain the Filebeat binary file that supports output to Apache Doris. You can [click to download](https://apache-doris-releases.oss-accelerate.aliyuncs.com/filebeat-doris-1.0.0) or compile it from the Apache Doris source code. - -2. Configure Filebeat. Specify the filebeat_demo.yml field that is used to configure the specific input path of the collected logs and the settings for output to Apache Doris. - -```YAML -# input -filebeat.inputs: -- type: log -enabled: true -paths: - - /path/to/your/log -multiline: - type: pattern - pattern: '^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' - negate: true - match: after - skip_newline: true - -processors: -- script: - lang: javascript - source: > - function process(event) { - var msg = event.Get("message"); - msg = msg.replace(/\t/g, " "); - event.Put("message", msg); - } -- dissect: - # 2024-06-08 18:26:25,481 INFO (report-thread|199) [ReportHandler.cpuReport():617] begin to handle - tokenizer: "%{day} %{time} %{log_level} (%{thread}) [%{position}] %{content}" - target_prefix: "" - ignore_failure: true - overwrite_keys: true - -# queue and batch -queue.mem: -events: 1000000 -flush.min_events: 100000 -flush.timeout: 10s - -# output -output.doris: -fenodes: [ "http://fehost1:http_port", "http://fehost2:http_port", "http://fehost3:http_port" ] -user: "your_username" -password: "your_password" -database: "your_db" -table: "your_table" -# output string format -codec_format_string: '{"ts": "%{[day]} %{[time]}", "host": "%{[agent][hostname]}", "path": "%{[log][file][path]}", "message": "%{[message]}"}' -headers: - format: "json" - read_json_by_line: "true" - load_to_single_tablet: "true" -``` - -3. Run Filebeat according to the command below, collect logs, and output to Apache Doris. - - ```shell - chmod +x filebeat-doris-1.0.0 - ./filebeat-doris-1.0.0 -c filebeat_demo.yml - ``` - -For more information about Filebeat, refer to [Beats Doris Output Plugin](../../ecosystem/beats.md). - -**Integrating Kafka** - -Write JSON formatted logs to Kafka's message queue, create a Kafka Routine Load, and allow Apache Doris to actively pull data from Kafka. - -You can refer to the example below, where `property.*` represents Librdkafka client-related configurations and needs to be adjusted according to the actual Kafka cluster situation. - -```SQL -CREATE ROUTINE LOAD load_log_kafka ON log_db.log_table -COLUMNS(ts, clientip, request, status, size) -PROPERTIES ( -"max_batch_interval" = "10", -"max_batch_rows" = "1000000", -"max_batch_size" = "109715200", -"load_to_single_tablet" = "true", -"timeout" = "600", -"strict_mode" = "false", -"format" = "json" -) -FROM KAFKA ( -"kafka_broker_list" = "host:port", -"kafka_topic" = "log__topic_", -"property.group.id" = "your_group_id", -"property.security.protocol"="SASL_PLAINTEXT", -"property.sasl.mechanism"="GSSAPI", -"property.sasl.kerberos.service.name"="kafka", -"property.sasl.kerberos.keytab"="/path/to/xxx.keytab", -"property.sasl.kerberos.principal"="" -); -
SHOW ROUTINE LOAD; -``` - -For more information about Kafka, see [Routine Load](../../data-operate/import/import-way/routine-load-manual.md). - -**Using customized programs to collect logs** - -In addition to integrating common log collectors, you can also customize programs to import log data into Apache Doris using the Stream Load HTTP API. Refer to the following code: - -```shell -curl \ ---location-trusted \ --u username:password \ --H "format:json" \ --H "read_json_by_line:true" \ --H "load_to_single_tablet:true" \ --H "timeout:600" \ --T logfile.json \ -http://fe_host:fe_http_port/api/log_db/log_table/_stream_load -``` - -When using custom programs, pay attention to the following key points: - -- Use Basic Auth for HTTP authentication; the credentials can be generated with the command `echo -n 'username:password' | base64`. - -- Set HTTP header "format:json" to specify the data format as JSON. - -- Set HTTP header "read_json_by_line:true" to specify one JSON object per line. - -- Set HTTP header "load_to_single_tablet:true" to import data into one bucket at a time to reduce small file imports. - -- It is recommended to write batches of 100 MB to 1 GB on the client side. For Apache Doris 2.1 and later, the server-side Group Commit feature can reduce the batch size required on the client side. - -### Step 6: Query and analyze logs - -**Query logs** - -Apache Doris supports standard SQL, so you can connect to the cluster through a MySQL client or JDBC to execute SQL for log queries. - -```Plain Text -mysql -h fe_host -P fe_mysql_port -u your_username -Dyour_db_name -``` - -Here are 5 common SQL query commands for reference: - -- View the latest 10 log entries - -```SQL -SELECT * FROM your_table_name ORDER BY ts DESC LIMIT 10; -``` - -- Query the latest 10 log entries with the host as 8.8.8.8 - -```SQL -SELECT * FROM your_table_name WHERE host = '8.8.8.8' ORDER BY ts DESC LIMIT 10; -``` - -- Retrieve the latest 10 log entries with error or 404 in the message field. In the command below, MATCH_ANY is a full-text search SQL syntax used by Apache Doris for matching any of the keywords in a field. - -```SQL -SELECT * FROM your_table_name WHERE message MATCH_ANY 'error 404' -ORDER BY ts DESC LIMIT 10; -``` - -- Retrieve the latest 10 log entries with image and faq in the message field. In the command below, MATCH_ALL is a full-text search SQL syntax used by Apache Doris for matching all of the keywords in a field. - -```SQL -SELECT * FROM your_table_name WHERE message MATCH_ALL 'image faq' -ORDER BY ts DESC LIMIT 10; -``` - -- Retrieve the latest 10 entries with image and faq in the message field. In the following command, MATCH_PHRASE is a full-text search SQL syntax used by Apache Doris for matching all keywords in a field while also requiring consistent order. In the example below, `a image faq b` can match, but `a faq image b` cannot, because the order of image and faq does not match the query. - -```SQL -SELECT * FROM your_table_name WHERE message MATCH_PHRASE 'image faq' -ORDER BY ts DESC LIMIT 10; -``` - -**Analyze logs visually** - -Some third-party vendors offer visual log analysis development platforms based on Apache Doris, which include a log search and analysis interface similar to Kibana Discover. These platforms provide an intuitive and user-friendly exploratory log analysis interaction. 
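-
-Such interfaces ultimately issue SQL statements like the examples above. As an illustrative sketch that is not part of the original tutorial, an aggregation query over the example table can back a per-minute error histogram; all table and column names below come from the example schema created earlier:
-
-```SQL
--- Illustrative only: count error logs per minute over the last hour,
--- reusing the example table and columns defined above.
-SELECT
-    DATE_FORMAT(ts, '%Y-%m-%d %H:%i:00') AS time_bucket,
-    COUNT(*) AS error_count
-FROM your_table_name
-WHERE message MATCH_ANY 'error'
-  AND ts >= DATE_SUB(NOW(), INTERVAL 1 HOUR)
-GROUP BY DATE_FORMAT(ts, '%Y-%m-%d %H:%i:00')
-ORDER BY time_bucket;
-```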
- -![WebUI](/images/WebUI-EN.jpeg) - -- Support for full-text search and SQL modes - -- Support for selecting query log timeframes with time boxes and histograms - -- Display of detailed log information, expandable into JSON or tables - -- Interactive clicking to add and remove filter conditions in the log data context - -- Display of top field values in search results for finding anomalies and further drilling down for analysis - -Please contact dev@doris.apache.org to find more. diff --git a/versioned_docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md b/versioned_docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md index 7bc0748a6c945..9adca0fcac183 100644 --- a/versioned_docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md +++ b/versioned_docs/version-2.1/install/deploy-manually/storage-compute-coupled-deploy-manually.md @@ -1,6 +1,6 @@ --- { - "title": "Deploy Storage Compute Coupled Manually", + "title": "Deploying Manually", "language": "en" } --- diff --git a/versioned_docs/version-1.2/gettingStarted/tutorials/log-storage-analysis.md b/versioned_docs/version-2.1/log-storage-analysis.md similarity index 99% rename from versioned_docs/version-1.2/gettingStarted/tutorials/log-storage-analysis.md rename to versioned_docs/version-2.1/log-storage-analysis.md index 9d54040e66696..dd20c029d04b5 100644 --- a/versioned_docs/version-1.2/gettingStarted/tutorials/log-storage-analysis.md +++ b/versioned_docs/version-2.1/log-storage-analysis.md @@ -1,6 +1,6 @@ --- { - "title": "Building log analysis platform", + "title": "Log Storage and Analysis", "language": "en" } --- diff --git a/versioned_docs/version-2.1/practical-guide/log-storage-analysis.md b/versioned_docs/version-2.1/practical-guide/log-storage-analysis.md index 8dec55a6a27bc..750c395087e9c 100644 --- a/versioned_docs/version-2.1/practical-guide/log-storage-analysis.md +++ b/versioned_docs/version-2.1/practical-guide/log-storage-analysis.md @@ -1,6 +1,6 @@ --- { - "title": "Building log analysis platform", + "title": "Log Storage and Analysis", "language": "en" } --- diff --git a/versioned_docs/version-2.1/table-design/data-model/aggregate.md b/versioned_docs/version-2.1/table-design/data-model/aggregate.md index 987f909395a46..20fe8b8baee68 100644 --- a/versioned_docs/version-2.1/table-design/data-model/aggregate.md +++ b/versioned_docs/version-2.1/table-design/data-model/aggregate.md @@ -1,7 +1,7 @@ --- { - "title": "聚合模型", - "language": "zh-CN" + "title": "Aggregate Model", + "language": "en" } --- diff --git a/versioned_docs/version-2.1/table-design/data-model/duplicate.md b/versioned_docs/version-2.1/table-design/data-model/duplicate.md index f7de4e87a66b4..7df0ad1161214 100644 --- a/versioned_docs/version-2.1/table-design/data-model/duplicate.md +++ b/versioned_docs/version-2.1/table-design/data-model/duplicate.md @@ -1,6 +1,6 @@ --- { - "title": "明细模型", + "title": "Detail Model", "language": "zh-CN" } --- diff --git a/versioned_docs/version-2.1/table-design/data-model/overview.md b/versioned_docs/version-2.1/table-design/data-model/overview.md index e2578e8c6d7d7..d1de425396593 100644 --- a/versioned_docs/version-2.1/table-design/data-model/overview.md +++ b/versioned_docs/version-2.1/table-design/data-model/overview.md @@ -1,7 +1,7 @@ --- { - "title": "模型概述", - "language": "zh-CN" + "title": "Table Model Overview", + "language": "en" } --- @@ -34,7 +34,7 @@ Doris supports three types of table models: * **Primary Key Model (Unique Key 
Model)**: Ensures that each row has a unique Key value, and guarantees that there are no duplicate rows for a given Key column. The Doris storage layer retains only the latest written data for each key, making this model suitable for scenarios that involve data updates. -* **Aggregation Model (Aggregate Key Model)**: Allows data to be aggregated based on the Key columns. The Doris storage layer retains aggregated data, reducing storage space and improving query performance. This model is typically used in situations where summary or aggregated information (such as totals or averages) is required. +* **Aggregate Model (Aggregate Key Model)**: Allows data to be aggregated based on the Key columns. The Doris storage layer retains aggregated data, reducing storage space and improving query performance. This model is typically used in situations where summary or aggregated information (such as totals or averages) is required. Once the table is created, the table model attributes are confirmed and cannot be modified. It is crucial to choose the appropriate model based on business requirements: diff --git a/versioned_docs/version-2.1/table-design/data-model/unique.md b/versioned_docs/version-2.1/table-design/data-model/unique.md index 52e8ae1e11536..e80e983875cb3 100644 --- a/versioned_docs/version-2.1/table-design/data-model/unique.md +++ b/versioned_docs/version-2.1/table-design/data-model/unique.md @@ -1,7 +1,7 @@ --- { - "title": "主键模型", - "language": "zh-CN" + "title": "Primary Key Model", + "language": "en" } --- diff --git a/versioned_docs/version-2.1/table-design/schema-change.md b/versioned_docs/version-2.1/table-design/schema-change.md index e036394812082..2455e2baca342 100644 --- a/versioned_docs/version-2.1/table-design/schema-change.md +++ b/versioned_docs/version-2.1/table-design/schema-change.md @@ -1,6 +1,6 @@ --- { - "title": "Schema Evolution", + "title": "Schema Change", "language": "en" } --- diff --git a/versioned_docs/version-2.1/table-design/tiered-storage/overview.md b/versioned_docs/version-2.1/table-design/tiered-storage/overview.md index 6a7d3af05a336..f9003e67139d4 100644 --- a/versioned_docs/version-2.1/table-design/tiered-storage/overview.md +++ b/versioned_docs/version-2.1/table-design/tiered-storage/overview.md @@ -1,6 +1,6 @@ --- { - "title": "Tiered Storage", + "title": "Tiered Storage Overview", "language": "en-US" } --- diff --git a/versioned_docs/version-3.0/data-operate/export/export-manual.md b/versioned_docs/version-3.0/data-operate/export/export-manual.md index 0f42a9db9a972..c039c522d28d3 100644 --- a/versioned_docs/version-3.0/data-operate/export/export-manual.md +++ b/versioned_docs/version-3.0/data-operate/export/export-manual.md @@ -1,6 +1,6 @@ --- { - "title": "Export", + "title": "Using EXPORT Command", "language": "en" } --- diff --git a/versioned_docs/version-3.0/data-operate/export/export-with-mysql-dump.md b/versioned_docs/version-3.0/data-operate/export/export-with-mysql-dump.md index 121ce811a99a8..f1aacd79126da 100644 --- a/versioned_docs/version-3.0/data-operate/export/export-with-mysql-dump.md +++ b/versioned_docs/version-3.0/data-operate/export/export-with-mysql-dump.md @@ -1,6 +1,6 @@ --- { -"title": "MySQL Dump", +"title": "Using MySQL Dump", "language": "en" } --- diff --git a/versioned_docs/version-3.0/data-operate/export/outfile.md b/versioned_docs/version-3.0/data-operate/export/outfile.md index b1bd46d5e4cb8..cb0eaa14beb2d 100644 --- a/versioned_docs/version-3.0/data-operate/export/outfile.md +++ 
b/versioned_docs/version-3.0/data-operate/export/outfile.md @@ -1,6 +1,6 @@ --- { - "title": "Select Into Outfile", + "title": "Using SELECT INTO OUTFILE Command", "language": "en" } --- diff --git a/versioned_docs/version-3.0/db-connect/arrow-flight-sql-connect.md b/versioned_docs/version-3.0/db-connect/arrow-flight-sql-connect.md index adfd0dc540a3e..54382516bb980 100644 --- a/versioned_docs/version-3.0/db-connect/arrow-flight-sql-connect.md +++ b/versioned_docs/version-3.0/db-connect/arrow-flight-sql-connect.md @@ -1,6 +1,6 @@ --- { - "title": "High-speed data transmission link based on Arrow Flight SQL", + "title": "Connecting by Arrow Flight SQL Protocol", "language": "en" } --- diff --git a/versioned_docs/version-3.0/db-connect/database-connect.md b/versioned_docs/version-3.0/db-connect/database-connect.md index f09c90eaf9a20..bd01e8a68d7d4 100644 --- a/versioned_docs/version-3.0/db-connect/database-connect.md +++ b/versioned_docs/version-3.0/db-connect/database-connect.md @@ -1,6 +1,6 @@ --- { - "title": "Connecting to Database", + "title": "Connecting by MySQL Protocol", "language": "en" } --- diff --git a/versioned_docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md b/versioned_docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md deleted file mode 100644 index 9d54040e66696..0000000000000 --- a/versioned_docs/version-3.0/gettingStarted/tutorials/log-storage-analysis.md +++ /dev/null @@ -1,591 +0,0 @@ ---- -{ - "title": "Building log analysis platform", - "language": "en" -} ---- - - - -Logs record key events in the system and contain crucial information such as the events' subject, time, location, and content. To meet the diverse needs of observability in operations, network security monitoring, and business analysis, enterprises might need to collect scattered logs for centralized storage, querying, and analysis to extract valuable content from the log data further. - -In this scenario, Apache Doris provides a corresponding solution. With the characteristics of log scenarios in mind, Apache Doris added inverted-index and ultra-fast full-text search capabilities, optimizing write performance and storage space to the extreme. This allows users to build an open, high-performance, cost-effective, and unified log storage and analysis platform based on Apache Doris. - -Focused on this solution, this chapter contains the following 3 sections: - -- **Overall architecture**: This section explains the core components and architecture of the log storage and analysis platform built on Apache Doris. - -- **Features and advantages**: This section explains the features and advantages of the log storage and analysis platform built on Apache Doris. - -- **Operational guide**: This section explains how to build a log storage and analysis platform based on Apache Doris. - -## Overall architecture - -The following figure illustrates the architecture of the log storage and analysis platform built on Apache Doris: - -![Overall architecture](/images/doris-overall-architecture.png) - -The architecture contains the following 3 parts: - -- **Log collection and preprocessing**: Various log collection tools can write log data into Apache Doris through HTTP APIs. - -- **Log storage and analysis engine**: Apache Doris provides high-performance, low-cost unified log storage, offering rich retrieval and analysis capabilities through SQL interfaces. 
- -**Log analysis and alert interface**: Various log retrieval and analysis tools can query Apache Doris through standard SQL interfaces, providing users with a simple and user-friendly interface. - -## Features and advantages - -The log storage and analysis platform built on Apache Doris offers the following features and advantages: - -- **High throughput, low latency log writing**: Supports continuous, stable writing of hundreds of TB of log data per day at GB/s-level throughput, while maintaining latency within 1 second. - -- **Cost-effective storage of massive log data**: Supports petabyte-scale storage, saving 60% to 80% in storage costs compared to Elasticsearch, and further reducing storage costs by 50% by storing cold data in S3/HDFS. - -- **High-performance log full-text search and analysis**: Supports inverted indexes and full-text search, providing second-level response times for common log queries (keyword searches, trend analysis, etc.). - -- **Open and user-friendly upstream and downstream ecosystem**: Upstream, it integrates with common log collection systems and data sources such as Logstash, Filebeat, Fluentbit, and Kafka through Stream Load's universal HTTP APIs; downstream, it integrates with various visual analytics UIs through the standard MySQL protocol and syntax, such as the observability platform Grafana, the BI tool Superset, and the Kibana-like Doris WebUI for log retrieval. - -### Cost-effective performance - -After benchmark testing and production validation, the log storage and analysis platform built on Apache Doris has shown a 5 to 10 times cost-performance advantage over Elasticsearch. Apache Doris's performance benefits are mainly due to its globally leading high-performance storage and query engine, as well as specialized optimizations for log scenarios: - -- **Improved write throughput**: Elasticsearch's write performance bottleneck lies in CPU consumption for parsing data and building inverted indexes. In comparison, Apache Doris optimizes writes in two aspects: it uses SIMD and other CPU vector instructions to improve JSON data parsing speed and index-building performance, and it simplifies the inverted index structure for log scenarios by removing unnecessary data structures like forward indexes, effectively reducing index build complexity. With the same resources, Apache Doris's write performance is 3 to 5 times higher than Elasticsearch's. - -- **Reduced storage costs**: The storage bottleneck in Elasticsearch lies in the multiple storage formats for forward indexes, inverted indexes, and Docvalue columns, as well as the relatively low compression ratio of its general compression algorithms. In contrast, Apache Doris has made the following optimizations in storage: it removes the forward index, reducing the index data size by 30%; it uses columnar storage and the Zstandard compression algorithm, achieving a compression ratio of 5 to 10 times, which is significantly higher than Elasticsearch's 1.5 times; and because cold log data is accessed very infrequently, Apache Doris's hot and cold data tiering feature can automatically move logs older than a defined time period to lower-cost object storage, reducing the storage cost of cold data by more than 70%. For the same raw data, Doris's storage costs are only about 20% of those of Elasticsearch. - -### Strong analytical capabilities - -Apache Doris supports standard SQL and is compatible with the MySQL protocol and syntax. 
Therefore, log systems built on Apache Doris can use SQL for log analysis, giving the following advantages to log systems: - -- **Easy to use**: Engineers and data analysts are very familiar with SQL, their expertise can be reused, no need to learn new technology stacks to quickly get started. - -- **Rich ecosystem**: The MySQL ecosystem is the most widely used language in the database field, seamlessly integrating with and applying to the MySQL ecosystem. Doris can leverage MySQL command line and various GUI tools, BI tools, and other big data ecosystem tools for more complex and diverse data processing and analysis needs. - -- **Strong analytical capabilities**: SQL has become the de facto standard for database and big data analysis, with powerful expressive capabilities and functions supporting retrieval, aggregation, multi-table JOIN, subqueries, UDFs, logical views, materialized views, and various data analysis capabilities. - -### Flexible Schema - -Here is a typical example of a semi-structured log in JSON format. The top-level fields are some fixed fields, such as timestamp, source, node, component, level, clientRequestID, message, and properties, which are present in every log entry. The nested fields of the properties , such as properties.size and properties.format are more dynamic, and the fields of each log may vary. - -```JSON -{ - "timestamp": "2014-03-08T00:50:03.8432810Z", - "source": "ADOPTIONCUSTOMERS81", - "node": "Engine000000000405", - "level": "Information", - "component": "DOWNLOADER", - "clientRequestId": "671db15d-abad-94f6-dd93-b3a2e6000672", - "message": "Downloading file path: benchmark/2014/ADOPTIONCUSTOMERS81_94_0.parquet.gz", - "properties": { - "size": 1495636750, - "format": "parquet", - "rowCount": 855138, - "downloadDuration": "00:01:58.3520561" - } -} -``` - - -Apache Doris provides several aspects of support for Flexible Schema log data: - -- For changes to top-level fields, Light Schema Change can be used to add or remove columns and to add or remove indexes, enabling schema changes to be completed in seconds. When planning a log platform, users only need to consider which fields need to be indexed. - -- For extension fields similar to properties, the native semi-structured data type `VARIANT` is provided, which can write any JSON data, automatically recognize field names and types in JSON, and automatically split frequently occurring fields for columnar storage for subsequent analysis. Additionally, `VARIANT` can create inverted indexes to accelerate internal field queries and retrievals. - -Compared to Elasticsearch's Dynamic Mapping, Apache Doris's Flexible Schema has the following advantages: - -- Allows a field to have multiple types, `VARIANT` automatically handles conflicts and type promotion for fields, better adapting to iterative changes in log data. - -- `VARIANT` automatically merges infrequently occurring fields into a column store to avoid performance issues caused by excessive fields, metadata, or columns. - -- Not only can columns be dynamically added, but they can also be dynamically deleted, and indexes can be dynamically added or removed, eliminating the need to index all fields at the beginning like Elasticsearch, reducing unnecessary costs. - -## Operational guide - -### Step 1: Estimate resources - -Before deploying the cluster, you need to estimate the hardware resources required for the servers. Follow the steps below: - -1. 
Estimate the resources for data writing by the following calculation formulas: - -- `Average write throughput = Daily data increment / 86400 s` - -- `Peak write throughput = Average write throughput * Ratio of the peak write throughput to the average write throughput` - -- `Number of CPU cores for the peak write throughput = Peak write throughput / Write throughput of a single-core CPU` - -2. Estimate the resources for data storage by the calculation formula: `Storage space = Daily data increment / Data compression ratio * Number of data copies * Data storage duration`. - -3. Estimate the resources for data querying. These depend on the query volume and complexity. It is recommended to reserve 50% of CPU resources for data querying initially and then adjust according to the actual test results. - -4. Integrate the calculation results as follows: - - 1. Divide the number of CPU cores calculated in Step 1 and Step 3 by the number of CPU cores of a BE server, and you can get the number of BE servers. - - 2. Based on the number of BE servers and the calculation result of Step 2, estimate the storage space required for each BE server. - - 3. Allocate the storage space required for each BE server to 4 to 12 data disks, and you can get the storage capacity required for a single data disk. - -For example, suppose that the daily data increment is 100 TB, the data compression ratio is 5, the number of data copies is 1, the storage duration of hot data is 3 days, the storage duration of cold data is 30 days, the ratio of the peak write throughput to the average write throughput is 200%, the write throughput of a single-core CPU is 10 MB/s, and 50% of CPU resources are reserved for data querying. One can estimate that: - -- 3 FE servers are required, each configured with a 16-core CPU, 64 GB memory, and one 100 GB SSD disk. - -- 15 BE servers are required, each configured with a 32-core CPU, 256 GB memory, and ten 600 GB SSD disks. - -- 600 TB of S3 object storage space is required. - -Refer to the following table to learn about the values of indicators in the example above and how they are calculated. - -| Indicator (Unit) | Value | Description | -| --- | --- | --- | -| Daily data increment (TB) | 100 | Specify the value according to your actual needs. | -| Data compression ratio | 5 | Specify the value according to your actual needs, which is typically between 3 and 10. Note that the data contains index data. | -| Number of data copies | 1 | Specify the value according to your actual needs, which can be 1, 2, or 3. The default value is 1. | -| Storage duration of hot data (day) | 3 | Specify the value according to your actual needs. | -| Storage duration of cold data (day) | 30 | Specify the value according to your actual needs. | -| Data storage duration | 33 | Calculation formula: `Storage duration of hot data + Storage duration of cold data` | -| Estimated storage space for hot data (TB) | 60 | Calculation formula: `Daily data increment / Data compression ratio * Number of data copies * Storage duration of hot data` | -| Estimated storage space for cold data (TB) | 600 | Calculation formula: `Daily data increment / Data compression ratio * Number of data copies * Storage duration of cold data` | -| Ratio of the peak write throughput to the average write throughput | 200% | Specify the value according to your actual needs. The default value is 200%. | -| Number of CPU cores of a BE server | 32 | Specify the value according to your actual needs. The default value is 32. 
| -| Average write throughput (MB/s) | 1214 | Calculation formula: `Daily data increment / 86400 s` | -| Peak write throughput (MB/s) | 2427 | Calculation formula: `Average write throughput * Ratio of the peak write throughput to the average write throughput` | -| Number of CPU cores for the peak write throughput | 242.7 | Calculation formula: `Peak write throughput / Write throughput of a single-core CPU` | -| Percent of CPU resources reserved for data querying | 50% | Specify the value according to your actual needs. The default value is 50%. | -| Estimated number of BE servers | 15.2 | Calculation formula: `Number of CPU cores for the peak write throughput / Number of CPU cores of a BE server /(1 - Percent of CPU resources reserved for data querying)` | -| Rounded number of BE servers | 15 | Calculation formula: `MAX (Number of data copies, Estimated number of BE servers)` | -| Estimated data storage space for each BE server (TB) | 5.7 | Calculation formula: `Estimated storage space for hot data / Estimated number of BE servers /(1 - 30%)`, where 30% represents the percent of reserved storage space.

It is recommended to mount 4 to 12 data disks on each BE server to enhance I/O capabilities. | - -### Step 2: Deploy the cluster - -After estimating the resources, you need to deploy the cluster. It is recommended to deploy in both physical and virtual environments manually. For manual deployment, refer to [Manual Deployment](../../install/cluster-deployment/standard-deployment.md). - -### Step 3: Optimize FE and BE configurations - -After completing the cluster deployment, it is necessary to optimize the configuration parameters for both the front-end and back-end separately, so as to better suit the scenario of log storage and analysis. - -**Optimize FE configurations** - -You can find FE configuration fields in `fe/conf/fe.conf`. Refer to the following table to optimize FE configurations. - -| Configuration fields to be optimized | Description | -| :----------------------------------------------------------- | :----------------------------------------------------------- | -| `max_running_txn_num_per_db = 10000` | Increase the parameter value to adapt to high-concurrency import transactions. | -| `streaming_label_keep_max_second = 3600` `label_keep_max_second = 7200` | Increase the retention time to handle high-frequency import transactions with high memory usage. | -| `enable_round_robin_create_tablet = true` | When creating Tablets, use a Round Robin strategy to distribute evenly. | -| `tablet_rebalancer_type = partition` | When balancing Tablets, use a strategy to evenly distribute within each partition. | -| `autobucket_min_buckets = 10` | Increase the minimum number of automatically bucketed buckets from 1 to 10 to avoid insufficient buckets when the log volume increases. | -| `max_backend_heartbeat_failure_tolerance_count = 10` | In log scenarios, the BE server may experience high pressure, leading to short-term timeouts, so increase the tolerance count from 1 to 10. | - -For more information, refer to [FE Configuration](../../admin-manual/config/fe-config.md). - -**Optimize BE configurations** - -You can find BE configuration fields in `be/conf/be.conf`. Refer to the following table to optimize BE configurations. - -| Module | Configuration fields to be optimized | Description | -| :--------- | :----------------------------------------------------------- | :----------------------------------------------------------- | -| Storage | `storage_root_path = /path/to/dir1;/path/to/dir2;...;/path/to/dir12` | Configure the storage path for hot data on disk directories. | -| - | `enable_file_cache = true` | Enable file caching. | -| - | `file_cache_path = [{"path": "/mnt/datadisk0/file_cache", "total_size":53687091200, "query_limit": "10737418240"},{"path": "/mnt/datadisk1/file_cache", "total_size":53687091200,"query_limit": "10737418240"}]` | Configure the cache path and related settings for cold data with the following specific configurations:
`path`: cache path
`total_size`: total size of the cache path in bytes, where 53687091200 bytes equals 50 GB
`query_limit`: maximum amount of data that can be queried from the cache path in one query in bytes, where 10737418240 bytes equals 10 GB | -| Write | `write_buffer_size = 1073741824` | Increase the file size of the write buffer to reduce small files and random I/O operations, improving performance. | -| - | `max_tablet_version_num = 20000` | In coordination with the time_series compaction strategy for table creation, allow more versions to remain temporarily unmerged. | -| Compaction | `max_cumu_compaction_threads = 8` | Set to CPU core count / 4, indicating that 1/4 of CPU resources are used for writing, 1/4 for background compaction, and 1/2 for queries and other operations. | -| - | `inverted_index_compaction_enable = true` | Enable inverted index compaction to reduce CPU consumption during compaction. | -| - | `enable_segcompaction = false` `enable_ordered_data_compaction = false` | Disable two compaction features that are unnecessary for log scenarios. | -| - | `enable_compaction_priority_scheduling = false` | Disable priority scheduling for compaction; otherwise low-priority compaction is limited to 2 tasks per disk, which can slow down compaction. | -| - | `total_permits_for_compaction_score = 200000` | Limit the memory consumed by compaction; used together with the time_series compaction policy. | -| Cache | `disable_storage_page_cache = true` `inverted_index_searcher_cache_limit = 30%` | Due to the large volume of log data and limited caching effect, switch from data caching to index caching. | -| - | `inverted_index_cache_stale_sweep_time_sec = 3600` `index_cache_entry_stay_time_after_lookup_s = 3600` | Keep the index cache in memory for up to 1 hour. | -| - | `enable_inverted_index_cache_on_cooldown = true`
`enable_write_index_searcher_cache = false` | Enable automatic caching of cold data storage during index uploading. | -| - | `tablet_schema_cache_recycle_interval = 3600` `segment_cache_capacity = 20000` | Reduce memory usage by other caches. | -| - | `inverted_index_ram_dir_enable = true` | Reduce the IO overhead caused by writing to index files temporarily. | -| Thread | `pipeline_executor_size = 24` `doris_scanner_thread_pool_thread_num = 48` | Configure computing threads and I/O threads for a 32-core CPU in proportion to core count. | -| - | `scan_thread_nice_value = 5` | Lower the priority of query I/O threads to ensure writing performance and timeliness. | -| Other | `string_type_length_soft_limit_bytes = 10485760` | Increase the length limit of string-type data to 10 MB. | -| - | `trash_file_expire_time_sec = 300` `path_gc_check_interval_second = 900` `path_scan_interval_second = 900` | Accelerate the recycling of trash files. | - - -For more information, refer to [BE Configuration](../../admin-manual/config/be-config.md). - -### Step 4: Create tables - -Due to the distinct characteristics of both writing and querying log data, it is recommended to configure tables with targeted settings to enhance performance. - -**Configure data partitioning and bucketing** - -- For data partitioning: - - - Enable [range partitioning](../../table-design/data-partitioning/dynamic-partitioning#range-partition) (`PARTITION BY RANGE(`ts`)`) with [dynamic partitions](../../table-design/data-partitioning/dynamic-partitioning) (`"dynamic_partition.enable" = "true"`) managed automatically by day. - - - Use a field in the DATETIME type as the key (`DUPLICATE KEY(ts)`) for accelerated retrieval of the latest N log entries. - -- For data bucketing: - - - Configure the number of buckets to be roughly three times the total number of disks in the cluster, with each bucket containing approximately 5GB of data after compression. - - - Use the Random strategy (`DISTRIBUTED BY RANDOM BUCKETS 60`) to optimize batch writing efficiency when paired with single tablet imports. - -For more information, refer to [Data Partitioning](../../table-design/data-partitioning/basic-concepts). - -**Configure compression parameters** - -Use the zstd compression algorithm ("compression" = "zstd") to improve data compression efficiency. - -**Configure compaction parameters** - -Configure compaction fields as follows: - -- Use the time_series strategy (`"compaction_policy" = "time_series"`) to reduce write amplification, which is crucial for high-throughput log writes. - -**Configure index parameters** - -Configuring index fields as follows: - -- Create indexes for fields that are frequently queried (`USING INVERTED`). - -- For fields that require full-text search, specify the parser field as unicode, which satisfies most requirements. If there is a need to support phrase queries, set the support_phrase field to true; if not needed, set it to false to reduce storage space. - -**Configure storage parameters** - -Configure storage policies as follows: - -- For storage of hot data, if using cloud storage, configure the number of data copies as 1; if using physical disks, configure the number of data copies as at least 2 (`"replication_num" = "2"`). - -- Configure the storage location for log_s3 (`CREATE RESOURCE "log_s3"`) and set the log_policy_3day policy (`CREATE STORAGE POLICY log_policy_3day`), where the data is cooled and moved to the specified storage location of log_s3 after 3 days. Refer to the code below. 
- -```SQL -CREATE DATABASE log_db; -USE log_db; - -CREATE RESOURCE "log_s3" -PROPERTIES -( - "type" = "s3", - "s3.endpoint" = "your_endpoint_url", - "s3.region" = "your_region", - "s3.bucket" = "your_bucket", - "s3.root.path" = "your_path", - "s3.access_key" = "your_ak", - "s3.secret_key" = "your_sk" -); - -CREATE STORAGE POLICY log_policy_3day -PROPERTIES( - "storage_resource" = "log_s3", - "cooldown_ttl" = "259200" -); - -CREATE TABLE log_table -( - `ts` DATETIME, - `host` TEXT, - `path` TEXT, - `message` TEXT, - INDEX idx_host (`host`) USING INVERTED, - INDEX idx_path (`path`) USING INVERTED, - INDEX idx_message (`message`) USING INVERTED PROPERTIES("parser" = "unicode", "support_phrase" = "true") -) -ENGINE = OLAP -DUPLICATE KEY(`ts`) -PARTITION BY RANGE(`ts`) () -DISTRIBUTED BY RANDOM BUCKETS 60 -PROPERTIES ( - "compression" = "zstd", - "compaction_policy" = "time_series", - "dynamic_partition.enable" = "true", - "dynamic_partition.create_history_partition" = "true", - "dynamic_partition.time_unit" = "DAY", - "dynamic_partition.start" = "-30", - "dynamic_partition.end" = "1", - "dynamic_partition.prefix" = "p", - "dynamic_partition.buckets" = "60", - "dynamic_partition.replication_num" = "2", -- unneccessary for the compute-storage coupled mode - "replication_num" = "2", -- unneccessary for the compute-storage coupled mode - "storage_policy" = "log_policy_3day" -- unneccessary for the compute-storage coupled mode -); -``` - -### Step 5: Collect logs - -After completing table creation, you can proceed with log collection. - -Apache Doris provides open and versatile Stream HTTP APIs, through which you can connect with popular log collectors such as Logstash, Filebeat, Kafka, and others to carry out log collection work. This section explains how to integrate these log collectors using the Stream HTTP APIs. - -**Integrating Logstash** - -Follow these steps: - -1. Download and install the Logstash Doris Output plugin. You can choose one of the following two methods: - - - [Click to download](https://apache-doris-releases.oss-accelerate.aliyuncs.com/logstash-output-doris-1.0.0.gem) and install. - - - Compile from the source code and run the following command to install: - -```markdown -./bin/logstash-plugin install logstash-output-doris-1.0.0.gem -``` - -2. Configure Logstash. Specify the following fields: - -- `logstash.yml`: Used to configure Logstash batch processing log sizes and timings for improved data writing performance. - -```Plain Text -pipeline.batch.size: 1000000 -pipeline.batch.delay: 10000 -``` - -- `logstash_demo.conf`: Used to configure the specific input path of the collected logs and the settings for output to Apache Doris. - -``` -input { - file { - path => "/path/to/your/log" - } -} - -output { - doris { - http_hosts => [ "", "", "] - user => "your_username" - password => "your_password" - db => "your_db" - table => "your_table" - - # doris stream load http headers - headers => { - "format" => "json" - "read_json_by_line" => "true" - "load_to_single_tablet" => "true" - } - - # field mapping: doris fileld name => logstash field name - # %{} to get a logstash field, [] for nested field such as [host][name] for host.name - mapping => { - "ts" => "%{@timestamp}" - "host" => "%{[host][name]}" - "path" => "%{[log][file][path]}" - "message" => "%{message}" - } - log_request => true - log_speed_interval => 10 - } -} - ``` - -3. Run Logstash according to the command below, collect logs, and output to Apache Doris. 
- -```shell -./bin/logstash -f logstash_demo.conf -``` - -For more information about the Logstash Doris Output plugin, see [Logstash Doris Output Plugin](../../ecosystem/logstash.md). - -**Integrating Filebeat** - -Follow these steps: - -1. Obtain the Filebeat binary file that supports output to Apache Doris. You can [click to download](https://apache-doris-releases.oss-accelerate.aliyuncs.com/filebeat-doris-1.0.0) or compile it from the Apache Doris source code. - -2. Configure Filebeat. Specify the filebeat_demo.yml field that is used to configure the specific input path of the collected logs and the settings for output to Apache Doris. - -```YAML -# input -filebeat.inputs: -- type: log -enabled: true -paths: - - /path/to/your/log -multiline: - type: pattern - pattern: '^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' - negate: true - match: after - skip_newline: true - -processors: -- script: - lang: javascript - source: > - function process(event) { - var msg = event.Get("message"); - msg = msg.replace(/\t/g, " "); - event.Put("message", msg); - } -- dissect: - # 2024-06-08 18:26:25,481 INFO (report-thread|199) [ReportHandler.cpuReport():617] begin to handle - tokenizer: "%{day} %{time} %{log_level} (%{thread}) [%{position}] %{content}" - target_prefix: "" - ignore_failure: true - overwrite_keys: true - -# queue and batch -queue.mem: -events: 1000000 -flush.min_events: 100000 -flush.timeout: 10s - -# output -output.doris: -fenodes: [ "http://fehost1:http_port", "http://fehost2:http_port", "http://fehost3:http_port" ] -user: "your_username" -password: "your_password" -database: "your_db" -table: "your_table" -# output string format -codec_format_string: '{"ts": "%{[day]} %{[time]}", "host": "%{[agent][hostname]}", "path": "%{[log][file][path]}", "message": "%{[message]}"}' -headers: - format: "json" - read_json_by_line: "true" - load_to_single_tablet: "true" -``` - -3. Run Filebeat according to the command below, collect logs, and output to Apache Doris. - - ```shell - chmod +x filebeat-doris-1.0.0 - ./filebeat-doris-1.0.0 -c filebeat_demo.yml - ``` - -For more information about Filebeat, refer to [Beats Doris Output Plugin](../../ecosystem/beats.md). - -**Integrating Kafka** - -Write JSON formatted logs to Kafka's message queue, create a Kafka Routine Load, and allow Apache Doris to actively pull data from Kafka. - -You can refer to the example below, where `property.*` represents Librdkafka client-related configurations and needs to be adjusted according to the actual Kafka cluster situation. - -```SQL -CREATE ROUTINE LOAD load_log_kafka ON log_db.log_table -COLUMNS(ts, clientip, request, status, size) -PROPERTIES ( -"max_batch_interval" = "10", -"max_batch_rows" = "1000000", -"max_batch_size" = "109715200", -"load_to_single_tablet" = "true", -"timeout" = "600", -"strict_mode" = "false", -"format" = "json" -) -FROM KAFKA ( -"kafka_broker_list" = "host:port", -"kafka_topic" = "log__topic_", -"property.group.id" = "your_group_id", -"property.security.protocol"="SASL_PLAINTEXT", -"property.sasl.mechanism"="GSSAPI", -"property.sasl.kerberos.service.name"="kafka", -"property.sasl.kerberos.keytab"="/path/to/xxx.keytab", -"property.sasl.kerberos.principal"="" -); -
SHOW ROUTINE LOAD; -``` - -For more information about Kafka, see [Routine Load](../../data-operate/import/import-way/routine-load-manual.md). - -**Using customized programs to collect logs** - -In addition to integrating common log collectors, you can also customize programs to import log data into Apache Doris using the Stream Load HTTP API. Refer to the following code: - -```shell -curl ---location-trusted --u username:password --H "format:json" --H "read_json_by_line:true" --H "load_to_single_tablet:true" --H "timeout:600" --T logfile.json -http://fe_host:fe_http_port/api/log_db/log_table/_stream_load -``` - -When using custom programs, pay attention to the following key points: - -- Use Basic Auth for HTTP authentication, calculate using the command echo -n 'username:password' | base64. - -- Set HTTP header "format:json" to specify the data format as JSON. - -- Set HTTP header "read_json_by_line:true" to specify one JSON per line. - -- Set HTTP header "load_to_single_tablet:true" to import data into one bucket at a time to reduce small file imports. - -- It is recommended to write batches whose sizes are between 100MB to 1GB on the client side. For Apache Doris version 2.1 and higher, you need to reduce batch sizes on the client side through the Group Commit function. - -### Step 6: Query and analyze logs - -**Query logs** - -Apache Doris supports standard SQL, so you can connect to the cluster through MySQL client or JDBC to execute SQL for log queries. - -```Plain Text -mysql -h fe_host -P fe_mysql_port -u your_username -Dyour_db_name -``` - -Here are 5 common SQL query commands for reference: - -- View the latest 10 log entries - -```SQL -SELECT * FROM your_table_name ORDER BY ts DESC LIMIT 10; -``` - -- Query the latest 10 log entries with the host as 8.8.8.8 - -```SQL -SELECT * FROM your_table_name WHERE host = '8.8.8.8' ORDER BY ts DESC LIMIT 10; -``` - -- Retrieve the latest 10 log entries with error or 404 in the request field. In the command below, MATCH_ANY is a full-text search SQL syntax used by Apache Doris for matching any keyword in the fields. - -```SQL -SELECT * FROM your_table_name WHERE message **MATCH_ANY** 'error 404' -ORDER BY ts DESC LIMIT 10; -``` - -- Retrieve the latest 10 log entries with image and faq in the request field. In the command below, MATCH_ALL is a full-text search SQL syntax used by Apache Doris for matching all keywords in the fields. - -```SQL -SELECT * FROM your_table_name WHERE message **MATCH_ALL** 'image faq' -ORDER BY ts DESC LIMIT 10; -``` - -- Retrieve the latest 10 entries with image and faq in the request field. In the following command, MATCH_PHRASE is a full-text search SQL syntax used by Apache Doris for matching all keywords in the fields and requiring consistent order. In the example below, a image faq b can match, but a faq image b cannot match because the order of image and faq does not match the syntax. - -```SQL -SELECT * FROM your_table_name WHERE message **MATCH_PHRASE** 'image faq' -ORDER BY ts DESC LIMIT 10; -``` - -**Analyze logs visually** - -Some third-party vendors offer visual log analysis development platforms based on Apache Doris, which include a log search and analysis interface similar to Kibana Discover. These platforms provide an intuitive and user-friendly exploratory log analysis interaction. 
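-
-As an illustrative sketch that is not part of the original tutorial, the "top field values" view mentioned below can be approximated with a plain aggregation over the example schema created earlier; all table and column names come from that schema:
-
-```SQL
--- Illustrative only: top 10 hosts by number of error logs in the last day,
--- reusing the example table and columns defined above.
-SELECT host, COUNT(*) AS error_count
-FROM your_table_name
-WHERE message MATCH_ANY 'error'
-  AND ts >= DATE_SUB(NOW(), INTERVAL 1 DAY)
-GROUP BY host
-ORDER BY error_count DESC
-LIMIT 10;
-```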
-
-![WebUI](/images/WebUI-EN.jpeg)
-
-- Support for full-text search and SQL modes
-
-- Support for selecting query log timeframes with time boxes and histograms
-
-- Display of detailed log information, expandable into JSON or tables
-
-- Interactive clicking to add and remove filter conditions in the log data context
-
-- Display of top field values in search results for finding anomalies and further drilling down for analysis
-
-Please contact dev@doris.apache.org to find more.
diff --git a/versioned_docs/version-2.0/gettingStarted/tutorials/log-storage-analysis.md b/versioned_docs/version-3.0/log-storage-analysis.md
similarity index 99%
rename from versioned_docs/version-2.0/gettingStarted/tutorials/log-storage-analysis.md
rename to versioned_docs/version-3.0/log-storage-analysis.md
index 9d54040e66696..dd20c029d04b5 100644
--- a/versioned_docs/version-2.0/gettingStarted/tutorials/log-storage-analysis.md
+++ b/versioned_docs/version-3.0/log-storage-analysis.md
@@ -1,6 +1,6 @@
 ---
 {
-    "title": "Building log analysis platform",
+    "title": "Log Storage and Analysis",
     "language": "en"
 }
 ---
diff --git a/versioned_docs/version-3.0/practical-guide/log-storage-analysis.md b/versioned_docs/version-3.0/practical-guide/log-storage-analysis.md
index 9d54040e66696..dd20c029d04b5 100644
--- a/versioned_docs/version-3.0/practical-guide/log-storage-analysis.md
+++ b/versioned_docs/version-3.0/practical-guide/log-storage-analysis.md
@@ -1,6 +1,6 @@
 ---
 {
-    "title": "Building log analysis platform",
+    "title": "Log Storage and Analysis",
     "language": "en"
 }
 ---
diff --git a/versioned_docs/version-3.0/table-design/data-model/aggregate.md b/versioned_docs/version-3.0/table-design/data-model/aggregate.md
index 987f909395a46..20fe8b8baee68 100644
--- a/versioned_docs/version-3.0/table-design/data-model/aggregate.md
+++ b/versioned_docs/version-3.0/table-design/data-model/aggregate.md
@@ -1,7 +1,7 @@
 ---
 {
-    "title": "聚合模型",
-    "language": "zh-CN"
+    "title": "Aggregate Model",
+    "language": "en"
 }
 ---
diff --git a/versioned_docs/version-3.0/table-design/data-model/duplicate.md b/versioned_docs/version-3.0/table-design/data-model/duplicate.md
index f7de4e87a66b4..0e79c65c38038 100644
--- a/versioned_docs/version-3.0/table-design/data-model/duplicate.md
+++ b/versioned_docs/version-3.0/table-design/data-model/duplicate.md
@@ -1,7 +1,7 @@
 ---
 {
-    "title": "明细模型",
-    "language": "zh-CN"
+    "title": "Detail Model",
+    "language": "en"
 }
 ---
diff --git a/versioned_docs/version-3.0/table-design/data-model/overview.md b/versioned_docs/version-3.0/table-design/data-model/overview.md
index e2578e8c6d7d7..d1de425396593 100644
--- a/versioned_docs/version-3.0/table-design/data-model/overview.md
+++ b/versioned_docs/version-3.0/table-design/data-model/overview.md
@@ -1,7 +1,7 @@
 ---
 {
-    "title": "模型概述",
-    "language": "zh-CN"
+    "title": "Table Model Overview",
+    "language": "en"
 }
 ---
@@ -34,7 +34,7 @@
 Doris supports three types of table models:

 * **Primary Key Model (Unique Key Model)**: Ensures that each row has a unique Key value, and guarantees that there are no duplicate rows for a given Key column. The Doris storage layer retains only the latest written data for each key, making this model suitable for scenarios that involve data updates.

-* **Aggregation Model (Aggregate Key Model)**: Allows data to be aggregated based on the Key columns. The Doris storage layer retains aggregated data, reducing storage space and improving query performance. This model is typically used in situations where summary or aggregated information (such as totals or averages) is required.
+* **Aggregate Model (Aggregate Key Model)**: Allows data to be aggregated based on the Key columns. The Doris storage layer retains aggregated data, reducing storage space and improving query performance. This model is typically used in situations where summary or aggregated information (such as totals or averages) is required.

 Once the table is created, the table model attributes are confirmed and cannot be modified. It is crucial to choose the appropriate model based on business requirements:
diff --git a/versioned_docs/version-3.0/table-design/data-model/unique.md b/versioned_docs/version-3.0/table-design/data-model/unique.md
index 52e8ae1e11536..e80e983875cb3 100644
--- a/versioned_docs/version-3.0/table-design/data-model/unique.md
+++ b/versioned_docs/version-3.0/table-design/data-model/unique.md
@@ -1,7 +1,7 @@
 ---
 {
-    "title": "主键模型",
-    "language": "zh-CN"
+    "title": "Primary Key Model",
+    "language": "en"
 }
 ---
diff --git a/versioned_docs/version-3.0/table-design/schema-change.md b/versioned_docs/version-3.0/table-design/schema-change.md
index 3abc3aa5ce077..d8540d0b1ae44 100644
--- a/versioned_docs/version-3.0/table-design/schema-change.md
+++ b/versioned_docs/version-3.0/table-design/schema-change.md
@@ -1,6 +1,6 @@
 ---
 {
-    "title": "Schema Evolution",
+    "title": "Schema Change",
     "language": "en"
 }
 ---
diff --git a/versioned_docs/version-3.0/table-design/tiered-storage/overview.md b/versioned_docs/version-3.0/table-design/tiered-storage/overview.md
index 6a7d3af05a336..f9003e67139d4 100644
--- a/versioned_docs/version-3.0/table-design/tiered-storage/overview.md
+++ b/versioned_docs/version-3.0/table-design/tiered-storage/overview.md
@@ -1,6 +1,6 @@
 ---
 {
-    "title": "Tiered Storage",
+    "title": "Tiered Storage Overview",
     "language": "en-US"
 }
 ---
diff --git a/versioned_sidebars/version-1.2-sidebars.json b/versioned_sidebars/version-1.2-sidebars.json
index c7c8611bb989f..4fac568544e51 100644
--- a/versioned_sidebars/version-1.2-sidebars.json
+++ b/versioned_sidebars/version-1.2-sidebars.json
@@ -5,26 +5,8 @@
             "label": "Getting Started",
             "collapsed": false,
             "items": [
-                "gettingStarted/what-is-new",
                 "gettingStarted/what-is-apache-doris",
-                "gettingStarted/quick-start",
-                {
-                    "type": "category",
-                    "label": "Tutorials",
-                    "items": [
-                        {
-                            "type": "category",
-                            "label": "Building lakehouse",
-                            "items": [
-                                "gettingStarted/tutorials/building-lakehouse/doris-hudi",
-                                "gettingStarted/tutorials/building-lakehouse/doris-paimon",
-                                "gettingStarted/tutorials/building-lakehouse/doris-iceberg",
-                                "gettingStarted/tutorials/building-lakehouse/doris-lakesoul"
-                            ]
-                        },
-                        "gettingStarted/tutorials/log-storage-analysis"
-                    ]
-                }
+                "gettingStarted/quick-start"
             ]
         },
         {
diff --git a/versioned_sidebars/version-2.0-sidebars.json b/versioned_sidebars/version-2.0-sidebars.json
index e38afeb61193d..aa492229935fa 100644
--- a/versioned_sidebars/version-2.0-sidebars.json
+++ b/versioned_sidebars/version-2.0-sidebars.json
@@ -5,26 +5,8 @@
             "label": "Getting Started",
             "collapsed": false,
             "items": [
-                "gettingStarted/what-is-new",
                 "gettingStarted/what-is-apache-doris",
-                "gettingStarted/quick-start",
-                {
-                    "type": "category",
-                    "label": "Tutorials",
-                    "items": [
-                        {
-                            "type": "category",
-                            "label": "Building lakehouse",
-                            "items": [
-                                "gettingStarted/tutorials/building-lakehouse/doris-hudi",
-                                "gettingStarted/tutorials/building-lakehouse/doris-paimon",
-                                "gettingStarted/tutorials/building-lakehouse/doris-iceberg",
-                                "gettingStarted/tutorials/building-lakehouse/doris-lakesoul"
-                            ]
-                        },
-                        "gettingStarted/tutorials/log-storage-analysis"
-                    ]
-                }
+                "gettingStarted/quick-start"
             ]
         },
         {
diff --git a/versioned_sidebars/version-2.1-sidebars.json b/versioned_sidebars/version-2.1-sidebars.json
index 3937d01b06286..5f0c8f9af6872 100644
--- a/versioned_sidebars/version-2.1-sidebars.json
+++ b/versioned_sidebars/version-2.1-sidebars.json
@@ -5,16 +5,8 @@
             "label": "Getting Started",
             "collapsed": false,
             "items": [
-                "gettingStarted/what-is-new",
                 "gettingStarted/what-is-apache-doris",
-                "gettingStarted/quick-start",
-                {
-                    "type": "category",
-                    "label": "Tutorials",
-                    "items": [
-                        "gettingStarted/tutorials/log-storage-analysis"
-                    ]
-                }
+                "gettingStarted/quick-start"
             ]
         },
         {
@@ -35,33 +27,21 @@
                     "install/preparation/os-checking"
                 ]
             },
-            {
-                "type": "category",
-                "label": "Cluster Deployment Manually",
-                "items": [
-                    "install/deploy-manually/storage-compute-coupled-deploy-manually"
-                ]
-            },
+            "install/deploy-manually/storage-compute-coupled-deploy-manually",
             {
                 "type": "category",
                 "label": "Deploying on Kubernetes",
                 "items": [
-                    {
-                        "type": "category",
-                        "label": "Compute storage coupled",
-                        "items": [
-                            "install/cluster-deployment/k8s-deploy/install-doris-operator",
-                            "install/cluster-deployment/k8s-deploy/install-config-cluster",
-                            "install/cluster-deployment/k8s-deploy/install-doris-cluster",
-                            "install/cluster-deployment/k8s-deploy/access-cluster",
-                            "install/cluster-deployment/k8s-deploy/cluster-operation"
-                        ]
-                    }
+                    "install/cluster-deployment/k8s-deploy/install-doris-operator",
+                    "install/cluster-deployment/k8s-deploy/install-config-cluster",
+                    "install/cluster-deployment/k8s-deploy/install-doris-cluster",
+                    "install/cluster-deployment/k8s-deploy/access-cluster",
+                    "install/cluster-deployment/k8s-deploy/cluster-operation"
                 ]
             },
             {
                 "type": "category",
-                "label": "Deployment on Cloud",
+                "label": "Deploying on Cloud",
                 "items": [
                     "install/cluster-deployment/doris-on-aws"
                 ]
@@ -148,6 +128,7 @@
                 "data-operate/import/data-source/hdfs",
                 "data-operate/import/data-source/amazon-s3",
                 "data-operate/import/data-source/google-cloud-storage",
+                "data-operate/import/data-source/azure-storage",
                 "data-operate/import/data-source/aliyun-oss",
                 "data-operate/import/data-source/huawei-obs",
                 "data-operate/import/data-source/tencent-cos",
@@ -418,6 +399,7 @@
                 }
             ]
         },
+        "log-storage-analysis",
         {
             "type": "category",
             "label": "Security",
@@ -445,6 +427,16 @@
                 }
             ]
         },
+        {
+            "type": "category",
+            "label": "Benchmark",
+            "collapsed": false,
+            "items": [
+                "benchmark/ssb",
+                "benchmark/tpch",
+                "benchmark/tpcds"
+            ]
+        },
         {
             "type": "category",
             "label": "Management",
@@ -746,16 +738,6 @@
                 "admin-manual/compaction"
             ]
         },
-        {
-            "type": "category",
-            "label": "Benchmark",
-            "collapsed": false,
-            "items": [
-                "benchmark/ssb",
-                "benchmark/tpch",
-                "benchmark/tpcds"
-            ]
-        },
         {
             "type": "category",
             "label": "Ecosystem",
diff --git a/versioned_sidebars/version-3.0-sidebars.json b/versioned_sidebars/version-3.0-sidebars.json
index c78dcf40ffee7..c27304b995aa5 100644
--- a/versioned_sidebars/version-3.0-sidebars.json
+++ b/versioned_sidebars/version-3.0-sidebars.json
@@ -5,16 +5,8 @@
             "label": "Getting Started",
             "collapsed": false,
             "items": [
-                "gettingStarted/what-is-new",
                 "gettingStarted/what-is-apache-doris",
-                "gettingStarted/quick-start",
-                {
-                    "type": "category",
-                    "label": "Tutorials",
-                    "items": [
-                        "gettingStarted/tutorials/log-storage-analysis"
-                    ]
-                }
+                "gettingStarted/quick-start"
             ]
         },
         {
@@ -73,7 +65,7 @@
             },
             {
                 "type": "category",
-                "label": "Deployment on Cloud",
+                "label": "Deploying on Cloud",
                 "items": [
                     "install/cluster-deployment/doris-on-aws"
                 ]
@@ -160,6 +152,7 @@
                 "data-operate/import/data-source/hdfs",
                 "data-operate/import/data-source/amazon-s3",
                 "data-operate/import/data-source/google-cloud-storage",
+                "data-operate/import/data-source/azure-storage",
                 "data-operate/import/data-source/aliyun-oss",
                 "data-operate/import/data-source/huawei-obs",
                 "data-operate/import/data-source/tencent-cos",
@@ -436,6 +429,7 @@
                 }
             ]
         },
+        "log-storage-analysis",
         {
             "type": "category",
             "label": "Compute-Storage Decoupled",
@@ -477,6 +471,16 @@
                 }
             ]
         },
+        {
+            "type": "category",
+            "label": "Benchmark",
+            "collapsed": false,
+            "items": [
+                "benchmark/ssb",
+                "benchmark/tpch",
+                "benchmark/tpcds"
+            ]
+        },
         {
             "type": "category",
             "label": "Management",
@@ -779,16 +783,6 @@
                 "admin-manual/compaction"
             ]
         },
-        {
-            "type": "category",
-            "label": "Benchmark",
-            "collapsed": false,
-            "items": [
-                "benchmark/ssb",
-                "benchmark/tpch",
-                "benchmark/tpcds"
-            ]
-        },
         {
             "type": "category",
             "label": "Ecosystem",
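
Taken together, the sidebar hunks above apply one pattern: every versioned sidebar flattens "Getting Started" down to two pages, while the 2.1 and 3.0 sidebars additionally register `log-storage-analysis` as a top-level entry and move the "Benchmark" category up ahead of "Management". A minimal sketch of the resulting shape for an affected sidebar is shown below; the top-level `"docs"` key is assumed from the usual Docusaurus sidebar layout, and the categories between these entries are elided.

```json
{
  "docs": [
    {
      "type": "category",
      "label": "Getting Started",
      "collapsed": false,
      "items": [
        "gettingStarted/what-is-apache-doris",
        "gettingStarted/quick-start"
      ]
    },
    "log-storage-analysis",
    {
      "type": "category",
      "label": "Benchmark",
      "collapsed": false,
      "items": [
        "benchmark/ssb",
        "benchmark/tpch",
        "benchmark/tpcds"
      ]
    }
  ]
}
```

Doc IDs here follow the markdown file paths under the versioned docs root, so the top-level `log-storage-analysis` entry corresponds to the file renamed to `versioned_docs/version-3.0/log-storage-analysis.md` earlier in this patch, which is why the old `gettingStarted/tutorials/...` entries are removed from the sidebars.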