From 031528c0a419a3f8074e816ab9630c12b5a06d66 Mon Sep 17 00:00:00 2001 From: discord9 Date: Mon, 2 Sep 2024 17:40:57 +0800 Subject: [PATCH] refactor: per review --- .../continuous-aggregation/overview.md | 7 +- .../continuous-aggregation/query.md | 105 ------------------ .../continuous-aggregation/usecase-example.md | 31 +----- sidebars.ts | 2 +- 4 files changed, 11 insertions(+), 134 deletions(-) delete mode 100644 docs/user-guide/continuous-aggregation/query.md diff --git a/docs/user-guide/continuous-aggregation/overview.md b/docs/user-guide/continuous-aggregation/overview.md index 1826fde87..3bc3fc11b 100644 --- a/docs/user-guide/continuous-aggregation/overview.md +++ b/docs/user-guide/continuous-aggregation/overview.md @@ -64,7 +64,7 @@ SELECT min(size) as min_size, max(size) as max_size, avg(size) as avg_size, - sum(case when `size` > 550::double then 1::double else 0::double end) as high_size_count, + sum(case when `size` > 550 then 1 else 0 end) as high_size_count, date_bin(INTERVAL '1 minutes', access_time) as time_window, FROM ngx_access_log GROUP BY @@ -133,14 +133,15 @@ Here is the explanation of the columns in the `ngx_statistics` table: - `time_window`: The time window of the aggregation. - `update_at`: The time when the aggregation is updated. -NOTE: if you don't manually create sink table, the Flow engine will automatically create it for you based on the query(i.e. using columns in `GROUP BY` as primary tags and time index), however, sometimes you may want to create the sink table manually to have more control over the schema. + + ## Next Steps Congratulations you already have a preliminary understanding of the continuous aggregation feature. Please refer to the following sections to learn more: +- [Usecase Example](./usecase-example.md) provides more examples of how to use continuous aggregation in real-time analytics, monitoring, and dashboard. - [Manage Flows](./manage-flow.md) describes how to create, update, and delete a flow. Each of your continuous aggregation query is a flow. -- [Write a Query](./query.md) describes how to write a continuous aggregation query. - [Define Time Window](./define-time-window.md) describes how to define the time window for the continuous aggregation. Time window is an important attribute of your continuous aggregation query. It defines the time interval for the aggregation. - [Expression](./expression.md) is a reference of available expressions in the continuous aggregation query. diff --git a/docs/user-guide/continuous-aggregation/query.md b/docs/user-guide/continuous-aggregation/query.md deleted file mode 100644 index b9e574f0b..000000000 --- a/docs/user-guide/continuous-aggregation/query.md +++ /dev/null @@ -1,105 +0,0 @@ -# Write a Query - -This chapter describes how to write a continuous aggregation query in GreptimeDB. Query here should be a `SELECT` statement with either aggregating functions or non-aggregating functions (i.e., scalar function). - -Generally speaking, the `SQL` part in the flow is just like a normal `SELECT` clause with a few difference. -The grammar of the query is like the following: - -```sql -SELECT AGGR_FUNCTION(column1, column2,..) FROM GROUP BY TIME_WINDOW_FUNCTION(); -``` - -Only two kinds of expression are allowed after `SELECT` keyword: -- Aggregate functions: see the reference in [Expression](./expression.md) for detail. -- Scalar functions: like `col`, `to_lowercase(col)`, `col + 1`, etc. This part is the same as the normal `SELECT` clause in GreptimeDB. - -The query should have a `FROM` clause to identify the source table. As the join clause is currently not supported, the query can only aggregate columns from a single table. - -`GROUP BY` clause works as in a normal query. It groups the data by the specified columns. One special thing is the time window functions `hop()` and `tumble()` described in [Define Time Window](./define-time-window.md) part. They are used in the `GROUP BY` clause to define the time window for the aggregation. Other expressions in `GROUP BY` can be either literal, column or scalar expressions. - -Others things like `ORDER BY`, `LIMIT`, `OFFSET` are not supported in the continuous aggregation query. - -## Rewrite an existing query to a continuous aggregation query - -Some of simple existing aggregation queries can be directly used as continuous aggregation queries. For example, the example in [overview](./overview.md) can be used to query both in standard SQL and continuous aggregation query, since it's also a valid SQL query without any flow-specific syntax or functions: - -```sql -SELECT - status, - count(client) AS total_logs, - min(size) as min_size, - max(size) as max_size, - avg(size) as avg_size, - sum(case when `size` > 550::double then 1::double else 0::double end) as high_size_count, - date_bin(INTERVAL '1 minutes', access_time) as time_window, -FROM ngx_access_log -GROUP BY - status, - time_window; -``` - -However, there are other types of queries that cannot be directly used as continuous aggregation queries. -For example, a query that needs to compute percentiles would be unwise to repeatedly calculate the percentile for each time window everytime a new batch of data arrive. In this case, you can pre-aggregate the data into buckets of the desired size, and then calculate the percentile in the sink table using standard SQL when needed. The original SQL might be: -```sql -SELECT - status, - percentile_approx(size, 0.5) as median_size, - date_bin(INTERVAL '1 minutes', access_time) as time_window, -FROM ngx_access_log -GROUP BY - status, - time_window; -``` -The above query can be rewritten to first aggregate the data into buckets of size 10, and then calculate the percentile in the sink table. -The flow query would be: -```sql -CREATE FLOW calc_ngx_distribution -SINK TO ngx_distribution -AS -SELECT - status, - trunc(size, -1) as bucket, - count(client) AS total_logs, - date_bin(INTERVAL '1 minutes', access_time) as time_window, -FROM ngx_access_log -GROUP BY - status, - time_window, - bucket; -``` - -And then you can calculate the percentile in the sink table using standard SQL: -```sql -SELECT - outer.status, - outer.time_window, - outer.bucket, - SUM(case when in1.bucket <= outer.bucket then in1.total_logs else 0 end) * 100 / SUM(in1.total_logs) AS percentile -FROM ngx_distribution AS outer -JOIN ngx_distribution AS in1 -ON in1.status = outer.status -AND in1.time_window = outer.time_window -GROUP BY - status, - time_window, - bucket -ORDER BY status, time_window, bucket; -``` - -The SQL query groups the data by status, time_window, and bucket. The percentile column calculates the percentage within each group by taking the sum of all buckets not greater than the current bucket and dividing it by the total count of all logs. The result would be something like this: - -```sql - status | time_window | bucket | percentile ---------+----------------------------+--------+------------ - 404 | 1970-01-01 00:00:00.000000 | 0 | 22 - 404 | 1970-01-01 00:00:00.000000 | 1 | 55 - 404 | 1970-01-01 00:00:00.000000 | 2 | 66 - 404 | 1970-01-01 00:00:00.000000 | 3 | 100 -(4 rows) -``` - - \ No newline at end of file diff --git a/docs/user-guide/continuous-aggregation/usecase-example.md b/docs/user-guide/continuous-aggregation/usecase-example.md index 2b28512eb..52f1ea345 100644 --- a/docs/user-guide/continuous-aggregation/usecase-example.md +++ b/docs/user-guide/continuous-aggregation/usecase-example.md @@ -1,4 +1,4 @@ -# Usecase Example +# Usecase Examples Following are three major usecase examples for continuous aggregation: 1. **Real-time Analytics**: A real-time analytics platform that continuously aggregates data from a stream of events, delivering immediate insights while optionally downsampling the data to a lower resolution. For instance, this system can compile data from a high-frequency stream of log events (e.g., occurring every millisecond) to provide up-to-the-minute insights such as the number of requests per minute, average response times, and error rates per minute. @@ -9,30 +9,11 @@ Following are three major usecase examples for continuous aggregation: In all these usecases, the continuous aggregation system continuously aggregates data from a stream of events and provides real-time insights and alerts based on the aggregated data. The system can also downsample the data to a lower resolution to reduce the amount of data stored and processed. This allows the system to provide real-time insights and alerts while keeping the data storage and processing costs low. -## Real-time Analytics Example +## Real-time analytics example -Consider a usecase where you have a stream of log events from a web server that you want to analyze in real-time. The log events contain information such as the status of the request, the size of the response, the client IP address, and the timestamp of the request. You want to continuously aggregate this data to provide real-time analytics on the number of requests per minute, the min/max/average packet size, and the error rate per minute. Then the query for continuous aggregation would be: +See [Overview](overview.md) for an example of real-time analytics. Which is to calculate the total number of logs, the minimum size, the maximum size, the average size, and the number of packets with the size greater than 550 for each status code in a 1-minute fixed window for access logs. -```sql -CREATE FLOW ngx_aggregation -SINK TO ngx_statistics -AS -SELECT - status, - count(client) AS total_logs, - sum(case when status >= 400 then 1 end) as error_logs, - min(size) as min_size, - max(size) as max_size, - avg(size) as avg_size -FROM ngx_access_log -GROUP BY - status, - date_bin(INTERVAL '1 minutes', access_time, '2024-01-01 00:00:00'::Timestamp); -``` - -The above query continuously aggregates the data from the `ngx_access_log` table into the `ngx_statistics` table. It calculates the total number of logs, the number of error logs, the min/max/average packet size, and the error rate per minute. The `date_bin` function is used to group the data into one-minute intervals. The `ngx_statistics` table will be continuously updated with the aggregated data, providing real-time insights into the web server's performance. - -## Real-time Monitoring Example +## Real-time monitoring example Consider a usecase where you have a stream of sensor events from a network of temperature sensors that you want to monitor in real-time. The sensor events contain information such as the sensor ID, the temperature reading, the timestamp of the reading, and the location of the sensor. You want to continuously aggregate this data to provide real-time alerts when the temperature exceeds a certain threshold. Then the query for continuous aggregation would be: @@ -61,7 +42,7 @@ HAVING max_temp > 100; The above query continuously aggregates the data from the `temp_sensor_data` table into the `temp_alerts` table. It calculates the maximum temperature reading for each sensor and location and filters out the data where the maximum temperature exceeds 100 degrees. The `temp_alerts` table will be continuously updated with the aggregated data, providing real-time alerts (Which is a new row in the `temp_alerts` table) when the temperature exceeds the threshold. -## Real-time Dashboard +## Real-time dashboard Consider a usecase in which you need a bar graph that show the distribution of packet sizes for each status code to monitor the health of the system. The query for continuous aggregation would be: @@ -83,6 +64,6 @@ GROUP BY The above query puts the data from the `ngx_access_log` table into the `ngx_distribution` table. It calculates the total number of logs for each status code and packet size bucket (in this case, since `trunc`'s second argument is -1, meaning a bucket size of 10) for each time window. The `date_bin` function is used to group the data into one-minute intervals. The `ngx_distribution` table will be continuously updated with the aggregated data, providing real-time insights into the distribution of packet sizes for each status code. -# Conclusion +## Conclusion Continuous aggregation is a powerful tool for real-time analytics, monitoring, and dashboarding. It allows you to continuously aggregate data from a stream of events and provide real-time insights and alerts based on the aggregated data. By downsampling the data to a lower resolution, you can reduce the amount of data stored and processed, making it easier to provide real-time insights and alerts while keeping the data storage and processing costs low. Continuous aggregation is a key component of any real-time data processing system and can be used in a wide range of usecases to provide real-time insights and alerts based on streaming data. \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index 9217d2ae4..cdcdf10b4 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -118,8 +118,8 @@ const sidebars: SidebarsConfig = { label: 'Continuous Aggregation', items: [ 'user-guide/continuous-aggregation/overview', + 'user-guide/continuous-aggregation/usecase-example', 'user-guide/continuous-aggregation/manage-flow', - 'user-guide/continuous-aggregation/query', 'user-guide/continuous-aggregation/define-time-window', 'user-guide/continuous-aggregation/expression', ],