From aa336ab9efe8b07895b3e7148e5ad9c3b97b9ffe Mon Sep 17 00:00:00 2001 From: bitliu Date: Fri, 20 Oct 2023 16:17:22 +0800 Subject: [PATCH 1/4] design(docs/api): control plane metrics monitoring Signed-off-by: bitliu --- api/v1alpha1/envoygateway_helpers.go | 77 +++++- api/v1alpha1/envoygateway_metrics_types.go | 65 +++++ api/v1alpha1/envoygateway_types.go | 17 ++ ...ic_types.go => envoyproxy_metric_types.go} | 58 ++-- .../validation/envoygateway_validate.go | 10 + .../validation/envoygateway_validate_test.go | 41 +++ .../validation/envoyproxy_validate.go | 11 + .../validation/envoyproxy_validate_test.go | 44 +++ api/v1alpha1/zz_generated.deepcopy.go | 226 +++++++++++---- .../proxy/resource_provider_test.go | 4 +- internal/xds/bootstrap/bootstrap_test.go | 10 +- site/content/en/latest/api/extension_types.md | 188 +++++++++---- site/content/en/latest/design/eg-metrics.md | 259 ++++++++++++++++++ .../content/en/latest/install/install-helm.md | 6 +- 14 files changed, 874 insertions(+), 142 deletions(-) create mode 100644 api/v1alpha1/envoygateway_metrics_types.go rename api/v1alpha1/{metric_types.go => envoyproxy_metric_types.go} (90%) create mode 100644 site/content/en/latest/design/eg-metrics.md diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go index d9673a0cd2a..cbb615b922f 100644 --- a/api/v1alpha1/envoygateway_helpers.go +++ b/api/v1alpha1/envoygateway_helpers.go @@ -19,10 +19,11 @@ func DefaultEnvoyGateway() *EnvoyGateway { APIVersion: GroupVersion.String(), }, EnvoyGatewaySpec{ - Gateway: DefaultGateway(), - Provider: DefaultEnvoyGatewayProvider(), - Logging: DefaultEnvoyGatewayLogging(), - Admin: DefaultEnvoyGatewayAdmin(), + Gateway: DefaultGateway(), + Provider: DefaultEnvoyGatewayProvider(), + Logging: DefaultEnvoyGatewayLogging(), + Admin: DefaultEnvoyGatewayAdmin(), + Telemetry: DefaultEnvoyGatewayTelemetry(), }, } } @@ -47,6 +48,9 @@ func (e *EnvoyGateway) SetEnvoyGatewayDefaults() { if e.Admin == nil { e.Admin = 
DefaultEnvoyGatewayAdmin() } + if e.Telemetry == nil { + e.Telemetry = DefaultEnvoyGatewayTelemetry() + } } // GetEnvoyGatewayAdmin returns the EnvoyGatewayAdmin of EnvoyGateway or a default EnvoyGatewayAdmin if unspecified. @@ -88,6 +92,71 @@ func DefaultEnvoyGatewayLogging() *EnvoyGatewayLogging { } } +// GetEnvoyGatewayTelemetry returns the EnvoyGatewayTelemetry of EnvoyGateway, with any unset metrics settings defaulted, or a default EnvoyGatewayTelemetry if unspecified. +func (e *EnvoyGateway) GetEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { + if e.Telemetry != nil { + if e.Telemetry.Metrics == nil { + e.Telemetry.Metrics = DefaultEnvoyGatewayMetrics() + } + if e.Telemetry.Metrics.Prometheus == nil { + e.Telemetry.Metrics.Prometheus = DefaultEnvoyGatewayPrometheus() + } + if e.Telemetry.Metrics.Address == nil { + e.Telemetry.Metrics.Address = DefaultEnvoyGatewayMetricsAddress() + } + return e.Telemetry + } + e.Telemetry = DefaultEnvoyGatewayTelemetry() + + return e.Telemetry +} + +// GetEnvoyGatewayMetricsAddress returns the EnvoyGateway Metrics Address. +func (e *EnvoyGateway) GetEnvoyGatewayMetricsAddress() string { + address := e.GetEnvoyGatewayTelemetry().Metrics.Address + if address != nil { + return fmt.Sprintf("%s:%d", address.Host, address.Port) + } + + return "" +} + +// IfDisablePrometheus returns if disable prometheus. +func (e *EnvoyGateway) IfDisablePrometheus() bool { + return e.GetEnvoyGatewayTelemetry().Metrics.Prometheus.Disable +} + +// DefaultEnvoyGatewayTelemetry returns a new EnvoyGatewayTelemetry with default configuration parameters. +func DefaultEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { + return &EnvoyGatewayTelemetry{ + Metrics: DefaultEnvoyGatewayMetrics(), + } +} + +// DefaultEnvoyGatewayMetrics returns a new EnvoyGatewayMetrics with default configuration parameters. 
+func DefaultEnvoyGatewayMetrics() *EnvoyGatewayMetrics { + return &EnvoyGatewayMetrics{ + Address: DefaultEnvoyGatewayMetricsAddress(), + Prometheus: DefaultEnvoyGatewayPrometheus(), + } +} + +// DefaultEnvoyGatewayMetricsAddress returns a new EnvoyGatewayMetricsAddress with default configuration parameters. +func DefaultEnvoyGatewayMetricsAddress() *EnvoyGatewayMetricsAddress { + return &EnvoyGatewayMetricsAddress{ + Host: GatewayMetricsHost, + Port: GatewayMetricsPort, + } +} + +// DefaultEnvoyGatewayPrometheus returns a new EnvoyGatewayPrometheusProvider with default configuration parameters. +func DefaultEnvoyGatewayPrometheus() *EnvoyGatewayPrometheusProvider { + return &EnvoyGatewayPrometheusProvider{ + // Enable prometheus pull by default. + Disable: false, + } +} + // DefaultEnvoyGatewayProvider returns a new EnvoyGatewayProvider with default configuration parameters. func DefaultEnvoyGatewayProvider() *EnvoyGatewayProvider { return &EnvoyGatewayProvider{ diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go new file mode 100644 index 00000000000..96fa3f4c587 --- /dev/null +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -0,0 +1,65 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package v1alpha1 + +// EnvoyGatewayMetrics defines control plane push/pull metrics configurations. +type EnvoyGatewayMetrics struct { + // Address defines the address of Envoy Gateway Metrics Server. + Address *EnvoyGatewayMetricsAddress + // Sinks defines the metric sinks where metrics are sent to. + Sinks []EnvoyGatewayMetricSink `json:"sinks,omitempty"` + // Prometheus defines the configuration for prometheus endpoint. + Prometheus *EnvoyGatewayPrometheusProvider `json:"prometheus,omitempty"` +} + +// EnvoyGatewayMetricSink defines control plane +// metric sinks where metrics are sent to. 
+type EnvoyGatewayMetricSink struct { + // Type defines the metric sink type. + // EG control plane currently supports OpenTelemetry. + // +kubebuilder:validation:Enum=OpenTelemetry + // +kubebuilder:default=OpenTelemetry + Type MetricSinkType `json:"type"` + // OpenTelemetry defines the configuration for OpenTelemetry sink. + // It's required if the sink type is OpenTelemetry. + OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` +} + +type EnvoyGatewayOpenTelemetrySink struct { + // Host define the sink service hostname. + Host string `json:"host"` + // Protocol define the sink service protocol. + // +kubebuilder:validation:Enum=grpc;http + Protocol string `json:"protocol"` + // Port defines the port the sink service is exposed on. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=4317 + Port int32 `json:"port,omitempty"` +} + +// EnvoyGatewayPrometheusProvider will expose prometheus endpoint in pull mode. +type EnvoyGatewayPrometheusProvider struct { + // Disable defines if disables the prometheus metrics in pull mode. + // + Disable bool `json:"disable,omitempty"` +} + +// EnvoyGatewayMetricsAddress defines the Envoy Gateway Metrics Address configuration. +type EnvoyGatewayMetricsAddress struct { + // Port defines the port the metrics server is exposed on. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=19001 + Port int `json:"port,omitempty"` + // Host defines the metrics server hostname. + // + // +optional + // +kubebuilder:default="0.0.0.0" + Host string `json:"host,omitempty"` +} diff --git a/api/v1alpha1/envoygateway_types.go b/api/v1alpha1/envoygateway_types.go index efdf826eae5..bfbf4101e50 100644 --- a/api/v1alpha1/envoygateway_types.go +++ b/api/v1alpha1/envoygateway_types.go @@ -19,6 +19,10 @@ const ( GatewayAdminPort = 19000 // GatewayAdminHost is the host of envoy gateway admin server. 
GatewayAdminHost = "127.0.0.1" + // GatewayMetricsPort is the port which envoy gateway metrics server is listening on. + GatewayMetricsPort = 19001 + // GatewayMetricsHost is the host of envoy gateway metrics server. + GatewayMetricsHost = "0.0.0.0" ) // +kubebuilder:object:root=true @@ -59,6 +63,12 @@ type EnvoyGatewaySpec struct { // +optional Admin *EnvoyGatewayAdmin `json:"admin,omitempty"` + // Telemetry defines the desired control plane telemetry related abilities. + // If unspecified, the telemetry is used with default configuration. + // + // +optional + Telemetry *EnvoyGatewayTelemetry `json:"telemetry,omitempty"` + // RateLimit defines the configuration associated with the Rate Limit service // deployed by Envoy Gateway required to implement the Global Rate limiting // functionality. The specific rate limit service used here is the reference @@ -80,6 +90,13 @@ type EnvoyGatewaySpec struct { ExtensionAPIs *ExtensionAPISettings `json:"extensionApis,omitempty"` } +// EnvoyGatewayTelemetry defines telemetry configurations for envoy gateway control plane. +// Control plane will focus on metrics observability telemetry and tracing telemetry later. +type EnvoyGatewayTelemetry struct { + // Metrics defines metrics configuration for envoy gateway. + Metrics *EnvoyGatewayMetrics `json:"metrics,omitempty"` +} + // EnvoyGatewayLogging defines logging for Envoy Gateway. type EnvoyGatewayLogging struct { // Level is the logging level. If unspecified, defaults to "info". 
diff --git a/api/v1alpha1/metric_types.go b/api/v1alpha1/envoyproxy_metric_types.go similarity index 90% rename from api/v1alpha1/metric_types.go rename to api/v1alpha1/envoyproxy_metric_types.go index b3bf977a761..170a1686b00 100644 --- a/api/v1alpha1/metric_types.go +++ b/api/v1alpha1/envoyproxy_metric_types.go @@ -5,11 +5,17 @@ package v1alpha1 +type MetricSinkType string + +const ( + MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" +) + type ProxyMetrics struct { // Prometheus defines the configuration for Admin endpoint `/stats/prometheus`. - Prometheus *PrometheusProvider `json:"prometheus,omitempty"` + Prometheus *ProxyPrometheusProvider `json:"prometheus,omitempty"` // Sinks defines the metric sinks where metrics are sent to. - Sinks []MetricSink `json:"sinks,omitempty"` + Sinks []ProxyMetricSink `json:"sinks,omitempty"` // Matches defines configuration for selecting specific metrics instead of generating all metrics stats // that are enabled by default. This helps reduce CPU and memory overhead in Envoy, but eliminating some stats // may after critical functionality. Here are the stats that we strongly recommend not disabling: @@ -23,13 +29,7 @@ type ProxyMetrics struct { EnableVirtualHostStats bool `json:"enableVirtualHostStats,omitempty"` } -type MetricSinkType string - -const ( - MetricSinkTypeOpenTelemetry MetricSinkType = "OpenTelemetry" -) - -type MetricSink struct { +type ProxyMetricSink struct { // Type defines the metric sink type. // EG currently only supports OpenTelemetry. // +kubebuilder:validation:Enum=OpenTelemetry @@ -37,27 +37,10 @@ type MetricSink struct { Type MetricSinkType `json:"type"` // OpenTelemetry defines the configuration for OpenTelemetry sink. // It's required if the sink type is OpenTelemetry. - OpenTelemetry *OpenTelemetrySink `json:"openTelemetry,omitempty"` + OpenTelemetry *ProxyOpenTelemetrySink `json:"openTelemetry,omitempty"` } -// Match defines the stats match configuration. 
-type Match struct { - // MatcherType defines the stats matcher type - // - // +kubebuilder:validation:Enum=RegularExpression;Prefix;Suffix - Type MatcherType `json:"type"` - Value string `json:"value"` -} - -type MatcherType string - -const ( - Prefix MatcherType = "Prefix" - RegularExpression MatcherType = "RegularExpression" - Suffix MatcherType = "Suffix" -) - -type OpenTelemetrySink struct { +type ProxyOpenTelemetrySink struct { // Host define the service hostname. Host string `json:"host"` // Port defines the port the service is exposed on. @@ -71,7 +54,24 @@ type OpenTelemetrySink struct { // TODO: add support for customizing OpenTelemetry sink in https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/stat_sinks/open_telemetry/v3/open_telemetry.proto#envoy-v3-api-msg-extensions-stat-sinks-open-telemetry-v3-sinkconfig } -type PrometheusProvider struct { +type ProxyPrometheusProvider struct { // Disable the Prometheus endpoint. Disable bool `json:"disable,omitempty"` } + +// Match defines the stats match configuration. 
+type Match struct { + // MatcherType defines the stats matcher type + // + // +kubebuilder:validation:Enum=RegularExpression;Prefix;Suffix + Type MatcherType `json:"type"` + Value string `json:"value"` +} + +type MatcherType string + +const ( + Prefix MatcherType = "Prefix" + RegularExpression MatcherType = "RegularExpression" + Suffix MatcherType = "Suffix" +) diff --git a/api/v1alpha1/validation/envoygateway_validate.go b/api/v1alpha1/validation/envoygateway_validate.go index 1f254696ef2..657cb9ee73d 100644 --- a/api/v1alpha1/validation/envoygateway_validate.go +++ b/api/v1alpha1/validation/envoygateway_validate.go @@ -82,6 +82,16 @@ func ValidateEnvoyGateway(eg *v1alpha1.EnvoyGateway) error { return fmt.Errorf("unsupported extension server TLS certificateRef %v", certificateRefKind) } } + case eg.Telemetry != nil: + if eg.Telemetry.Metrics != nil { + for _, sink := range eg.Telemetry.Metrics.Sinks { + if sink.Type == v1alpha1.MetricSinkTypeOpenTelemetry { + if sink.OpenTelemetry == nil { + return fmt.Errorf("OpenTelemetry is required when sink Type is OpenTelemetry") + } + } + } + } } return nil } diff --git a/api/v1alpha1/validation/envoygateway_validate_test.go b/api/v1alpha1/validation/envoygateway_validate_test.go index a88caf5d517..06a3043be12 100644 --- a/api/v1alpha1/validation/envoygateway_validate_test.go +++ b/api/v1alpha1/validation/envoygateway_validate_test.go @@ -413,6 +413,47 @@ func TestValidateEnvoyGateway(t *testing.T) { }, }, expect: false, + }, { + name: "valid gateway metrics sink", + eg: &v1alpha1.EnvoyGateway{ + EnvoyGatewaySpec: v1alpha1.EnvoyGatewaySpec{ + Gateway: v1alpha1.DefaultGateway(), + Provider: v1alpha1.DefaultEnvoyGatewayProvider(), + Telemetry: &v1alpha1.EnvoyGatewayTelemetry{ + Metrics: &v1alpha1.EnvoyGatewayMetrics{ + Sinks: []v1alpha1.EnvoyGatewayMetricSink{ + { + Type: v1alpha1.MetricSinkTypeOpenTelemetry, + OpenTelemetry: &v1alpha1.EnvoyGatewayOpenTelemetrySink{ + Host: "x.x.x.x", + Port: 4317, + Protocol: "grpc", + }, + 
}, + }, + }, + }, + }, + }, + expect: true, + }, { + name: "invalid gateway metrics sink", + eg: &v1alpha1.EnvoyGateway{ + EnvoyGatewaySpec: v1alpha1.EnvoyGatewaySpec{ + Gateway: v1alpha1.DefaultGateway(), + Provider: v1alpha1.DefaultEnvoyGatewayProvider(), + Telemetry: &v1alpha1.EnvoyGatewayTelemetry{ + Metrics: &v1alpha1.EnvoyGatewayMetrics{ + Sinks: []v1alpha1.EnvoyGatewayMetricSink{ + { + Type: v1alpha1.MetricSinkTypeOpenTelemetry, + }, + }, + }, + }, + }, + }, + expect: false, }, } diff --git a/api/v1alpha1/validation/envoyproxy_validate.go b/api/v1alpha1/validation/envoyproxy_validate.go index 1ba367ef0b8..b1ebbbab80d 100644 --- a/api/v1alpha1/validation/envoyproxy_validate.go +++ b/api/v1alpha1/validation/envoyproxy_validate.go @@ -191,6 +191,17 @@ func validateProxyTelemetry(spec *egv1a1.EnvoyProxySpec) []error { } } + if spec != nil && spec.Telemetry != nil && spec.Telemetry.Metrics != nil { + for _, sink := range spec.Telemetry.Metrics.Sinks { + if sink.Type == egv1a1.MetricSinkTypeOpenTelemetry { + if sink.OpenTelemetry == nil { + err := fmt.Errorf("opentelemetry is required if the sink type is OpenTelemetry") + errs = append(errs, err) + } + } + } + } + return errs } diff --git a/api/v1alpha1/validation/envoyproxy_validate_test.go b/api/v1alpha1/validation/envoyproxy_validate_test.go index 7f4b4b7a952..0f60a6fa593 100644 --- a/api/v1alpha1/validation/envoyproxy_validate_test.go +++ b/api/v1alpha1/validation/envoyproxy_validate_test.go @@ -404,6 +404,50 @@ func TestValidateEnvoyProxy(t *testing.T) { }, }, expected: false, + }, { + name: "should invalid when metrics type is OpenTelemetry, but `OpenTelemetry` field being empty", + proxy: &egv1a1.EnvoyProxy{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test", + Name: "test", + }, + Spec: egv1a1.EnvoyProxySpec{ + Telemetry: &egv1a1.ProxyTelemetry{ + Metrics: &egv1a1.ProxyMetrics{ + Sinks: []egv1a1.ProxyMetricSink{ + { + Type: egv1a1.MetricSinkTypeOpenTelemetry, + }, + }, + }, + }, + }, + }, + expected: 
false, + }, { + name: "should valid when metrics type is OpenTelemetry and `OpenTelemetry` field being not empty", + proxy: &egv1a1.EnvoyProxy{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test", + Name: "test", + }, + Spec: egv1a1.EnvoyProxySpec{ + Telemetry: &egv1a1.ProxyTelemetry{ + Metrics: &egv1a1.ProxyMetrics{ + Sinks: []egv1a1.ProxyMetricSink{ + { + Type: egv1a1.MetricSinkTypeOpenTelemetry, + OpenTelemetry: &egv1a1.ProxyOpenTelemetrySink{ + Host: "0.0.0.0", + Port: 3217, + }, + }, + }, + }, + }, + }, + }, + expected: true, }, } diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index c9b8600c45f..b5677f62c35 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -545,6 +545,103 @@ func (in *EnvoyGatewayLogging) DeepCopy() *EnvoyGatewayLogging { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayMetricSink) DeepCopyInto(out *EnvoyGatewayMetricSink) { + *out = *in + if in.OpenTelemetry != nil { + in, out := &in.OpenTelemetry, &out.OpenTelemetry + *out = new(EnvoyGatewayOpenTelemetrySink) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayMetricSink. +func (in *EnvoyGatewayMetricSink) DeepCopy() *EnvoyGatewayMetricSink { + if in == nil { + return nil + } + out := new(EnvoyGatewayMetricSink) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *EnvoyGatewayMetrics) DeepCopyInto(out *EnvoyGatewayMetrics) { + *out = *in + if in.Address != nil { + in, out := &in.Address, &out.Address + *out = new(EnvoyGatewayMetricsAddress) + **out = **in + } + if in.Sinks != nil { + in, out := &in.Sinks, &out.Sinks + *out = make([]EnvoyGatewayMetricSink, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Prometheus != nil { + in, out := &in.Prometheus, &out.Prometheus + *out = new(EnvoyGatewayPrometheusProvider) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayMetrics. +func (in *EnvoyGatewayMetrics) DeepCopy() *EnvoyGatewayMetrics { + if in == nil { + return nil + } + out := new(EnvoyGatewayMetrics) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayMetricsAddress) DeepCopyInto(out *EnvoyGatewayMetricsAddress) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayMetricsAddress. +func (in *EnvoyGatewayMetricsAddress) DeepCopy() *EnvoyGatewayMetricsAddress { + if in == nil { + return nil + } + out := new(EnvoyGatewayMetricsAddress) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayOpenTelemetrySink) DeepCopyInto(out *EnvoyGatewayOpenTelemetrySink) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayOpenTelemetrySink. +func (in *EnvoyGatewayOpenTelemetrySink) DeepCopy() *EnvoyGatewayOpenTelemetrySink { + if in == nil { + return nil + } + out := new(EnvoyGatewayOpenTelemetrySink) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. +func (in *EnvoyGatewayPrometheusProvider) DeepCopyInto(out *EnvoyGatewayPrometheusProvider) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayPrometheusProvider. +func (in *EnvoyGatewayPrometheusProvider) DeepCopy() *EnvoyGatewayPrometheusProvider { + if in == nil { + return nil + } + out := new(EnvoyGatewayPrometheusProvider) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyGatewayProvider) DeepCopyInto(out *EnvoyGatewayProvider) { *out = *in @@ -613,6 +710,11 @@ func (in *EnvoyGatewaySpec) DeepCopyInto(out *EnvoyGatewaySpec) { *out = new(EnvoyGatewayAdmin) (*in).DeepCopyInto(*out) } + if in.Telemetry != nil { + in, out := &in.Telemetry, &out.Telemetry + *out = new(EnvoyGatewayTelemetry) + (*in).DeepCopyInto(*out) + } if in.RateLimit != nil { in, out := &in.RateLimit, &out.RateLimit *out = new(RateLimit) @@ -640,6 +742,26 @@ func (in *EnvoyGatewaySpec) DeepCopy() *EnvoyGatewaySpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EnvoyGatewayTelemetry) DeepCopyInto(out *EnvoyGatewayTelemetry) { + *out = *in + if in.Metrics != nil { + in, out := &in.Metrics, &out.Metrics + *out = new(EnvoyGatewayMetrics) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayTelemetry. +func (in *EnvoyGatewayTelemetry) DeepCopy() *EnvoyGatewayTelemetry { + if in == nil { + return nil + } + out := new(EnvoyGatewayTelemetry) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *EnvoyJSONPatchConfig) DeepCopyInto(out *EnvoyJSONPatchConfig) { *out = *in @@ -1406,26 +1528,6 @@ func (in *Match) DeepCopy() *Match { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MetricSink) DeepCopyInto(out *MetricSink) { - *out = *in - if in.OpenTelemetry != nil { - in, out := &in.OpenTelemetry, &out.OpenTelemetry - *out = new(OpenTelemetrySink) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricSink. -func (in *MetricSink) DeepCopy() *MetricSink { - if in == nil { - return nil - } - out := new(MetricSink) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *OpenTelemetryEnvoyProxyAccessLog) DeepCopyInto(out *OpenTelemetryEnvoyProxyAccessLog) { *out = *in @@ -1448,36 +1550,6 @@ func (in *OpenTelemetryEnvoyProxyAccessLog) DeepCopy() *OpenTelemetryEnvoyProxyA return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *OpenTelemetrySink) DeepCopyInto(out *OpenTelemetrySink) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OpenTelemetrySink. -func (in *OpenTelemetrySink) DeepCopy() *OpenTelemetrySink { - if in == nil { - return nil - } - out := new(OpenTelemetrySink) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *PrometheusProvider) DeepCopyInto(out *PrometheusProvider) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PrometheusProvider. 
-func (in *PrometheusProvider) DeepCopy() *PrometheusProvider { - if in == nil { - return nil - } - out := new(PrometheusProvider) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ProxyAccessLog) DeepCopyInto(out *ProxyAccessLog) { *out = *in @@ -1617,17 +1689,37 @@ func (in *ProxyLogging) DeepCopy() *ProxyLogging { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProxyMetricSink) DeepCopyInto(out *ProxyMetricSink) { + *out = *in + if in.OpenTelemetry != nil { + in, out := &in.OpenTelemetry, &out.OpenTelemetry + *out = new(ProxyOpenTelemetrySink) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProxyMetricSink. +func (in *ProxyMetricSink) DeepCopy() *ProxyMetricSink { + if in == nil { + return nil + } + out := new(ProxyMetricSink) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ProxyMetrics) DeepCopyInto(out *ProxyMetrics) { *out = *in if in.Prometheus != nil { in, out := &in.Prometheus, &out.Prometheus - *out = new(PrometheusProvider) + *out = new(ProxyPrometheusProvider) **out = **in } if in.Sinks != nil { in, out := &in.Sinks, &out.Sinks - *out = make([]MetricSink, len(*in)) + *out = make([]ProxyMetricSink, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -1649,6 +1741,36 @@ func (in *ProxyMetrics) DeepCopy() *ProxyMetrics { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProxyOpenTelemetrySink) DeepCopyInto(out *ProxyOpenTelemetrySink) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProxyOpenTelemetrySink. 
+func (in *ProxyOpenTelemetrySink) DeepCopy() *ProxyOpenTelemetrySink { + if in == nil { + return nil + } + out := new(ProxyOpenTelemetrySink) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ProxyPrometheusProvider) DeepCopyInto(out *ProxyPrometheusProvider) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ProxyPrometheusProvider. +func (in *ProxyPrometheusProvider) DeepCopy() *ProxyPrometheusProvider { + if in == nil { + return nil + } + out := new(ProxyPrometheusProvider) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ProxyTelemetry) DeepCopyInto(out *ProxyTelemetry) { *out = *in diff --git a/internal/infrastructure/kubernetes/proxy/resource_provider_test.go b/internal/infrastructure/kubernetes/proxy/resource_provider_test.go index 9e1602f44d4..1c784927385 100644 --- a/internal/infrastructure/kubernetes/proxy/resource_provider_test.go +++ b/internal/infrastructure/kubernetes/proxy/resource_provider_test.go @@ -259,7 +259,7 @@ func TestDeployment(t *testing.T) { infra: newTestInfra(), telemetry: &egv1a1.ProxyTelemetry{ Metrics: &egv1a1.ProxyMetrics{ - Prometheus: &egv1a1.PrometheusProvider{}, + Prometheus: &egv1a1.ProxyPrometheusProvider{}, }, }, }, @@ -353,7 +353,7 @@ func TestDeployment(t *testing.T) { } else { tc.infra.Proxy.Config.Spec.Telemetry = &egv1a1.ProxyTelemetry{ Metrics: &egv1a1.ProxyMetrics{ - Prometheus: &egv1a1.PrometheusProvider{ + Prometheus: &egv1a1.ProxyPrometheusProvider{ Disable: true, }, }, diff --git a/internal/xds/bootstrap/bootstrap_test.go b/internal/xds/bootstrap/bootstrap_test.go index bd246c212ea..b145045a8b3 100644 --- a/internal/xds/bootstrap/bootstrap_test.go +++ b/internal/xds/bootstrap/bootstrap_test.go @@ -24,7 +24,7 @@ func 
TestGetRenderedBootstrapConfig(t *testing.T) { { name: "disable-prometheus", proxyMetrics: &egv1a1.ProxyMetrics{ - Prometheus: &egv1a1.PrometheusProvider{ + Prometheus: &egv1a1.ProxyPrometheusProvider{ Disable: true, }, }, @@ -32,19 +32,19 @@ func TestGetRenderedBootstrapConfig(t *testing.T) { { name: "enable-prometheus", proxyMetrics: &egv1a1.ProxyMetrics{ - Prometheus: &egv1a1.PrometheusProvider{}, + Prometheus: &egv1a1.ProxyPrometheusProvider{}, }, }, { name: "otel-metrics", proxyMetrics: &egv1a1.ProxyMetrics{ - Prometheus: &egv1a1.PrometheusProvider{ + Prometheus: &egv1a1.ProxyPrometheusProvider{ Disable: true, }, - Sinks: []egv1a1.MetricSink{ + Sinks: []egv1a1.ProxyMetricSink{ { Type: egv1a1.MetricSinkTypeOpenTelemetry, - OpenTelemetry: &egv1a1.OpenTelemetrySink{ + OpenTelemetry: &egv1a1.ProxyOpenTelemetrySink{ Host: "otel-collector.monitoring.svc", Port: 4317, }, diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index 1f3545304b6..c77542b768b 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -253,6 +253,7 @@ EnvoyGateway is the schema for the envoygateways API. | `provider` _[EnvoyGatewayProvider](#envoygatewayprovider)_ | Provider defines the desired provider and provider-specific configuration. If unspecified, the Kubernetes provider is used with default configuration parameters. | | `logging` _[EnvoyGatewayLogging](#envoygatewaylogging)_ | Logging defines logging parameters for Envoy Gateway. | | `admin` _[EnvoyGatewayAdmin](#envoygatewayadmin)_ | Admin defines the desired admin related abilities. If unspecified, the Admin is used with default configuration parameters. | +| `telemetry` _[EnvoyGatewayTelemetry](#envoygatewaytelemetry)_ | Telemetry defines the desired control plane telemetry related abilities. If unspecified, the telemetry is used with default configuration. 
| | `rateLimit` _[RateLimit](#ratelimit)_ | RateLimit defines the configuration associated with the Rate Limit service deployed by Envoy Gateway required to implement the Global Rate limiting functionality. The specific rate limit service used here is the reference implementation in Envoy. For more details visit https://github.com/envoyproxy/ratelimit. This configuration is unneeded for "Local" rate limiting. | | `extensionManager` _[ExtensionManager](#extensionmanager)_ | ExtensionManager defines an extension manager to register for the Envoy Gateway Control Plane. | | `extensionApis` _[ExtensionAPISettings](#extensionapisettings)_ | ExtensionAPIs defines the settings related to specific Gateway API Extensions implemented by Envoy Gateway | @@ -388,6 +389,82 @@ _Appears in:_ | `level` _object (keys:[EnvoyGatewayLogComponent](#envoygatewaylogcomponent), values:[LogLevel](#loglevel))_ | Level is the logging level. If unspecified, defaults to "info". EnvoyGatewayLogComponent options: default/provider/gateway-api/xds-translator/xds-server/infrastructure/global-ratelimit. LogLevel options: debug/info/error/warn. | +#### EnvoyGatewayMetricSink + + + +EnvoyGatewayMetricSink defines control plane metric sinks where metrics are sent to. + +_Appears in:_ +- [EnvoyGatewayMetrics](#envoygatewaymetrics) + +| Field | Description | +| --- | --- | +| `type` _[MetricSinkType](#metricsinktype)_ | Type defines the metric sink type. EG control plane currently supports OpenTelemetry. | +| `openTelemetry` _[EnvoyGatewayOpenTelemetrySink](#envoygatewayopentelemetrysink)_ | OpenTelemetry defines the configuration for OpenTelemetry sink. It's required if the sink type is OpenTelemetry. | + + +#### EnvoyGatewayMetrics + + + +EnvoyGatewayMetrics defines control plane push/pull metrics configurations. 
+ +_Appears in:_ +- [EnvoyGatewayTelemetry](#envoygatewaytelemetry) + +| Field | Description | +| --- | --- | +| `Address` _[EnvoyGatewayMetricsAddress](#envoygatewaymetricsaddress)_ | Address defines the address of Envoy Gateway Metrics Server. | +| `sinks` _[EnvoyGatewayMetricSink](#envoygatewaymetricsink) array_ | Sinks defines the metric sinks where metrics are sent to. | +| `prometheus` _[EnvoyGatewayPrometheusProvider](#envoygatewayprometheusprovider)_ | Prometheus defines the configuration for prometheus endpoint. | + + +#### EnvoyGatewayMetricsAddress + + + +EnvoyGatewayMetricsAddress defines the Envoy Gateway Metrics Address configuration. + +_Appears in:_ +- [EnvoyGatewayMetrics](#envoygatewaymetrics) + +| Field | Description | +| --- | --- | +| `port` _integer_ | Port defines the port the metrics server is exposed on. | +| `host` _string_ | Host defines the metrics server hostname. | + + +#### EnvoyGatewayOpenTelemetrySink + + + + + +_Appears in:_ +- [EnvoyGatewayMetricSink](#envoygatewaymetricsink) + +| Field | Description | +| --- | --- | +| `host` _string_ | Host define the sink service hostname. | +| `protocol` _string_ | Protocol define the sink service protocol. | +| `port` _integer_ | Port defines the port the sink service is exposed on. | + + +#### EnvoyGatewayPrometheusProvider + + + +EnvoyGatewayPrometheusProvider will expose prometheus endpoint in pull mode. + +_Appears in:_ +- [EnvoyGatewayMetrics](#envoygatewaymetrics) + +| Field | Description | +| --- | --- | +| `disable` _boolean_ | Disable defines if disables the prometheus metrics in pull mode. | + + #### EnvoyGatewayProvider @@ -435,11 +512,27 @@ _Appears in:_ | `provider` _[EnvoyGatewayProvider](#envoygatewayprovider)_ | Provider defines the desired provider and provider-specific configuration. If unspecified, the Kubernetes provider is used with default configuration parameters. 
| | `logging` _[EnvoyGatewayLogging](#envoygatewaylogging)_ | Logging defines logging parameters for Envoy Gateway. | | `admin` _[EnvoyGatewayAdmin](#envoygatewayadmin)_ | Admin defines the desired admin related abilities. If unspecified, the Admin is used with default configuration parameters. | +| `telemetry` _[EnvoyGatewayTelemetry](#envoygatewaytelemetry)_ | Telemetry defines the desired control plane telemetry related abilities. If unspecified, the telemetry is used with default configuration. | | `rateLimit` _[RateLimit](#ratelimit)_ | RateLimit defines the configuration associated with the Rate Limit service deployed by Envoy Gateway required to implement the Global Rate limiting functionality. The specific rate limit service used here is the reference implementation in Envoy. For more details visit https://github.com/envoyproxy/ratelimit. This configuration is unneeded for "Local" rate limiting. | | `extensionManager` _[ExtensionManager](#extensionmanager)_ | ExtensionManager defines an extension manager to register for the Envoy Gateway Control Plane. | | `extensionApis` _[ExtensionAPISettings](#extensionapisettings)_ | ExtensionAPIs defines the settings related to specific Gateway API Extensions implemented by Envoy Gateway | +#### EnvoyGatewayTelemetry + + + +EnvoyGatewayTelemetry defines telemetry configurations for envoy gateway control plane. Control plane will focus on metrics observability telemetry and tracing telemetry later. + +_Appears in:_ +- [EnvoyGateway](#envoygateway) +- [EnvoyGatewaySpec](#envoygatewayspec) + +| Field | Description | +| --- | --- | +| `metrics` _[EnvoyGatewayMetrics](#envoygatewaymetrics)_ | Metrics defines metrics configuration for envoy gateway. | + + #### EnvoyJSONPatchConfig @@ -980,21 +1073,6 @@ _Appears in:_ -#### MetricSink - - - - - -_Appears in:_ -- [ProxyMetrics](#proxymetrics) - -| Field | Description | -| --- | --- | -| `type` _[MetricSinkType](#metricsinktype)_ | Type defines the metric sink type. 
EG currently only supports OpenTelemetry. | -| `openTelemetry` _[OpenTelemetrySink](#opentelemetrysink)_ | OpenTelemetry defines the configuration for OpenTelemetry sink. It's required if the sink type is OpenTelemetry. | - - #### MetricSinkType _Underlying type:_ `string` @@ -1002,7 +1080,8 @@ _Underlying type:_ `string` _Appears in:_ -- [MetricSink](#metricsink) +- [EnvoyGatewayMetricSink](#envoygatewaymetricsink) +- [ProxyMetricSink](#proxymetricsink) @@ -1022,35 +1101,6 @@ _Appears in:_ | `resources` _object (keys:string, values:string)_ | Resources is a set of labels that describe the source of a log entry, including envoy node info. It's recommended to follow [semantic conventions](https://opentelemetry.io/docs/reference/specification/resource/semantic_conventions/). | -#### OpenTelemetrySink - - - - - -_Appears in:_ -- [MetricSink](#metricsink) - -| Field | Description | -| --- | --- | -| `host` _string_ | Host define the service hostname. | -| `port` _integer_ | Port defines the port the service is exposed on. | - - -#### PrometheusProvider - - - - - -_Appears in:_ -- [ProxyMetrics](#proxymetrics) - -| Field | Description | -| --- | --- | -| `disable` _boolean_ | Disable the Prometheus endpoint. | - - #### ProviderType _Underlying type:_ `string` @@ -1187,6 +1237,21 @@ _Appears in:_ | `level` _object (keys:[ProxyLogComponent](#proxylogcomponent), values:[LogLevel](#loglevel))_ | Level is a map of logging level per component, where the component is the key and the log level is the value. If unspecified, defaults to "default: warn". | +#### ProxyMetricSink + + + + + +_Appears in:_ +- [ProxyMetrics](#proxymetrics) + +| Field | Description | +| --- | --- | +| `type` _[MetricSinkType](#metricsinktype)_ | Type defines the metric sink type. EG currently only supports OpenTelemetry. | +| `openTelemetry` _[ProxyOpenTelemetrySink](#proxyopentelemetrysink)_ | OpenTelemetry defines the configuration for OpenTelemetry sink. 
It's required if the sink type is OpenTelemetry. | + + #### ProxyMetrics @@ -1198,12 +1263,41 @@ _Appears in:_ | Field | Description | | --- | --- | -| `prometheus` _[PrometheusProvider](#prometheusprovider)_ | Prometheus defines the configuration for Admin endpoint `/stats/prometheus`. | -| `sinks` _[MetricSink](#metricsink) array_ | Sinks defines the metric sinks where metrics are sent to. | +| `prometheus` _[ProxyPrometheusProvider](#proxyprometheusprovider)_ | Prometheus defines the configuration for Admin endpoint `/stats/prometheus`. | +| `sinks` _[ProxyMetricSink](#proxymetricsink) array_ | Sinks defines the metric sinks where metrics are sent to. | | `matches` _[Match](#match) array_ | Matches defines configuration for selecting specific metrics instead of generating all metrics stats that are enabled by default. This helps reduce CPU and memory overhead in Envoy, but eliminating some stats may after critical functionality. Here are the stats that we strongly recommend not disabling: `cluster_manager.warming_clusters`, `cluster..membership_total`,`cluster..membership_healthy`, `cluster..membership_degraded`,reference https://github.com/envoyproxy/envoy/issues/9856, https://github.com/envoyproxy/envoy/issues/14610 | | `enableVirtualHostStats` _boolean_ | EnableVirtualHostStats enables envoy stat metrics for virtual hosts. | +#### ProxyOpenTelemetrySink + + + + + +_Appears in:_ +- [ProxyMetricSink](#proxymetricsink) + +| Field | Description | +| --- | --- | +| `host` _string_ | Host define the service hostname. | +| `port` _integer_ | Port defines the port the service is exposed on. | + + +#### ProxyPrometheusProvider + + + + + +_Appears in:_ +- [ProxyMetrics](#proxymetrics) + +| Field | Description | +| --- | --- | +| `disable` _boolean_ | Disable the Prometheus endpoint. 
| + + #### ProxyTelemetry diff --git a/site/content/en/latest/design/eg-metrics.md b/site/content/en/latest/design/eg-metrics.md new file mode 100644 index 00000000000..a16c63156d8 --- /dev/null +++ b/site/content/en/latest/design/eg-metrics.md @@ -0,0 +1,259 @@ +--- +date: 2023-10-10 +title: "Control Plane Observability: Metrics" +author: Xunzhuo Liu +linkTitle: "Control Plane Observability: Metrics" +--- + +{{% alert title="State" color="warning" %}} + ++ Author: [Xunzhuo Liu](https://github.com/Xunzhuo) ++ Affiliation: Tencent ++ Data: 2023-10-12 ++ Status: Done +{{% /alert %}} + +This document aims to cover all aspects of envoy gateway control plane metrics observability. + +{{% alert title="Note" color="secondary" %}} +**Data plane** observability (while important) is outside of scope for this document. +{{% /alert %}} + +## Current State + +At present, the Envoy Gateway control plane provides logs and controller-runtime metrics, without traces. Logs are managed through our proprietary library (`internal/logging`, a shim to `zap`) and are written to `/dev/stdout`. + +The absence of comprehensive and robust control plane metrics observability hinders the effective monitoring of Envoy Gateway in a production environment, a critical requirement before deploying Envoy Gateway into production. + +## Goals + +Our objectives include: + ++ Supporting **PULL** mode for Prometheus metrics and exposing these metrics on the admin address. ++ Supporting **PUSH** mode for Prometheus metrics, thereby sending metrics to the Open Telemetry Stats sink. + +## Non-Goals + +Our non-goals include: + ++ Supporting other stats sinks. ++ Only focusing on code design and does not provide specific code implementation. + +## Use-Cases + +The use-cases include: + ++ Exposing Prometheus metrics in the Envoy Gateway Control Plane. ++ Pushing Envoy Gateway Control Plane metrics via the Open Telemetry Sink. 
+ +## Design + +### Standards + +Our metrics, and traces in the future, will be built upon the [OpenTelemetry](https://opentelemetry.io/) standards. All metrics will be configured via the [OpenTelemetry SDK](https://opentelemetry.io/docs/specs/otel/metrics/sdk/), which offers neutral libraries that can be connected to various backends. + +This approach allows the Envoy Gateway code to concentrate on the crucial aspect - generating the metrics - and delegate all other tasks to systems designed for telemetry ingestion. + +### Attributes + +OpenTelemetry defines a set of [Semantic Conventions](https://opentelemetry.io/docs/concepts/semantic-conventions/), including [Kubernetes specific ones](https://opentelemetry.io/docs/specs/otel/resource/semantic_conventions/k8s/). + +These attributes can be expressed in logs (as keys of structured logs), traces (as attributes), and metrics (as labels). + +We aim to use attributes consistently where applicable. Where possible, these should adhere to codified Semantic Conventions; when not possible, they should maintain consistency across the project. + +### Extensibility + +Envoy Gateway supports both **PULL/PUSH** mode metrics, with Metrics exported via Prometheus by default. + +Additionally, Envoy Gateway can export metrics using both the [OTEL gRPC metrics exporter](https://opentelemetry.io/docs/specs/otel/metrics/sdk_exporters/otlp/#general) and [OTEL HTTP metrics exporter](https://opentelemetry.io/docs/specs/otel/metrics/sdk_exporters/otlp/#general), which pushes metrics by grpc/http to a remote OTEL collector. + +Users can extend these in two ways: + +#### Downstream Collection + +Based on the exported data, other tools can collect, process, and export telemetry as needed. Some examples include: + ++ Metrics in **PULL** mode: The OTEL collector can scrape Prometheus and export to X. ++ Metrics in **PUSH** mode: The OTEL collector can receive OTEL gRPC/HTTP exporter metrics and export to X. 
+ +While the examples above involve OTEL collectors, there are numerous other systems available. + +#### Vendor extensions + +The OTEL libraries allow for the registration of Providers/Handlers. While we will offer the default ones (PULL via Prometheus, PUSH via OTEL HTTP metrics exporter) mentioned in Envoy Gateway's extensibility, we can easily allow custom builds of Envoy Gateway to plug in alternatives if the default options don't meet their needs. + +For instance, users may prefer to write metrics over the OTLP gRPC metrics exporter instead of the HTTP metrics exporter. This is perfectly acceptable -- and almost impossible to prevent. The OTEL has ways to register their providers/exporters, and Envoy Gateway can ensure its usage is such that it's not overly difficult to swap out a different provider/exporter. + +### Stability + +Observability is, in essence, a user-facing API. Its primary purpose is to be consumed - by both humans and tooling. Therefore, having well-defined guarantees around their formats is crucial. + +Please note that this refers only to the contents of the telemetry - what we emit, the names of things, semantics, etc. Other settings like Prometheus vs OTLP, JSON vs plaintext, logging levels, etc., are not considered. + +I propose the following: + +#### Metrics + +Metrics offer the greatest potential for providing guarantees. They often directly influence alerts and dashboards, making changes highly impactful. This contrasts with traces and logs, which are often used for ad-hoc analysis, where minor changes to information can be easily understood by a human. + +Moreover, there is precedent for this: [Kubernetes Metrics Lifecycle](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#metric-lifecycle) has well-defined processes, and Envoy Gateway's dataplane (Envoy Proxy) metrics are de facto stable. + +Currently, all Envoy Gateway metrics lack defined stability. 
I suggest we categorize all existing metrics as either: + ++ ***Deprecated***: a metric that is intended to be phased out. ++ ***Experimental***: a metric that is off by default. ++ ***Alpha***: a metric that is on by default. + +We should aim to promote a core set of metrics to **Stable** within a few releases. + +## Envoy Gateway API Types + +New APIs will be added to Envoy Gateway config, which are used to manage Control Plane Telemetry bootstrap configs. + +### EnvoyGatewayTelemetry + +```go +// EnvoyGatewayTelemetry defines telemetry configurations for envoy gateway control plane. +// Control plane will focus on metrics observability telemetry and tracing telemetry later. +type EnvoyGatewayTelemetry struct { + // Metrics defines metrics configuration for envoy gateway. + Metrics *EnvoyGatewayMetrics `json:"metrics,omitempty"` +} +``` + +### EnvoyGatewayMetrics + +```go +// EnvoyGatewayMetrics defines control plane push/pull metrics configurations. +type EnvoyGatewayMetrics struct { + // Address defines the address of Envoy Gateway Metrics Server. + Address *EnvoyGatewayMetricsAddress + // Sinks defines the metric sinks where metrics are sent to. + Sinks []EnvoyGatewayMetricSink `json:"sinks,omitempty"` + // Prometheus defines the configuration for prometheus endpoint. + Prometheus *EnvoyGatewayPrometheusProvider `json:"prometheus,omitempty"` +} + +// EnvoyGatewayMetricSink defines control plane +// metric sinks where metrics are sent to. +type EnvoyGatewayMetricSink struct { + // Type defines the metric sink type. + // EG control plane currently supports OpenTelemetry. + // +kubebuilder:validation:Enum=OpenTelemetry + // +kubebuilder:default=OpenTelemetry + Type MetricSinkType `json:"type"` + // OpenTelemetry defines the configuration for OpenTelemetry sink. + // It's required if the sink type is OpenTelemetry. 
+ OpenTelemetry *EnvoyGatewayOpenTelemetrySink `json:"openTelemetry,omitempty"` +} + +type EnvoyGatewayOpenTelemetrySink struct { + // Host define the sink service hostname. + Host string `json:"host"` + // Protocol define the sink service protocol. + // +kubebuilder:validation:Enum=grpc;http + Protocol string `json:"protocol"` + // Port defines the port the sink service is exposed on. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=4317 + Port int32 `json:"port,omitempty"` +} + +// EnvoyGatewayPrometheusProvider will expose prometheus endpoint in pull mode. +type EnvoyGatewayPrometheusProvider struct { + // Disable defines if disables the prometheus metrics in pull mode. + // + Disable bool `json:"disable,omitempty"` +} + +// EnvoyGatewayMetricsAddress defines the Envoy Gateway Metrics Address configuration. +type EnvoyGatewayMetricsAddress struct { + // Port defines the port the metrics server is exposed on. + // + // +optional + // +kubebuilder:validation:Minimum=0 + // +kubebuilder:default=19001 + Port int `json:"port,omitempty"` + // Host defines the metrics server hostname. + // + // +optional + // +kubebuilder:default="0.0.0.0" + Host string `json:"host,omitempty"` +} + +``` + +#### Example + ++ The following is an example to enable prometheus metric. + +```yaml +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyGateway +gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +logging: + level: null + default: info +provider: + type: Kubernetes +telemetry: + metrics: + address: + host: 0.0.0.0 + port: 19001 + prometheus: + disable: false +``` + ++ The following is an example to send metric via Open Telemetry sink to OTEL gRPC Collector. 
+ +```yaml +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyGateway +gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +logging: + level: null + default: info +provider: + type: Kubernetes +telemetry: + metrics: + sinks: + - type: OpenTelemetry + openTelemetry: + host: otel-collector.monitoring.svc.cluster.local + port: 4317 + protocol: grpc +``` + ++ The following is an example to enable prometheus metric and send metric via Open Telemetry sink to OTEL HTTP Collector at the same time. + +```yaml +apiVersion: gateway.envoyproxy.io/v1alpha1 +kind: EnvoyGateway +gateway: + controllerName: gateway.envoyproxy.io/gatewayclass-controller +logging: + level: null + default: info +provider: + type: Kubernetes +telemetry: + metrics: + address: + host: 0.0.0.0 + port: 19001 + prometheus: + disable: false + sinks: + - type: OpenTelemetry + openTelemetry: + host: otel-collector.monitoring.svc.cluster.local + port: 4318 + protocol: http +``` diff --git a/site/content/en/latest/install/install-helm.md b/site/content/en/latest/install/install-helm.md index e83575ea5b1..41a122bb3ff 100644 --- a/site/content/en/latest/install/install-helm.md +++ b/site/content/en/latest/install/install-helm.md @@ -125,9 +125,9 @@ These are the ports used by Envoy Gateway and the managed Envoy Proxy. 
### Envoy Gateway | Envoy Gateway | Address | Port | Configurable | -|:----------------------:|:---------:|:------:|:--------------:| -| Xds EnvoyProxy Server | 0.0.0.0 | 18000 | Yes | -| Xds RateLimit Server | 0.0.0.0 | 18001 | Yes | +|:----------------------:|:---------:|:------:| :------: | +| Xds EnvoyProxy Server | 0.0.0.0 | 18000 | No | +| Xds RateLimit Server | 0.0.0.0 | 18001 | No | | Admin Server | 127.0.0.1 | 19000 | Yes | | Metrics Server | 0.0.0.0 | 19001 | Yes | | Health Check | 127.0.0.1 | 8081 | No | From d4c012f13c2924ec3f1806f4ce691735488f13b0 Mon Sep 17 00:00:00 2001 From: bitliu Date: Mon, 23 Oct 2023 14:59:22 +0800 Subject: [PATCH 2/4] remove prometheues address Signed-off-by: bitliu --- api/v1alpha1/envoygateway_helpers.go | 23 +----------- api/v1alpha1/envoygateway_metrics_types.go | 17 --------- api/v1alpha1/zz_generated.deepcopy.go | 20 ----------- site/content/en/latest/api/extension_types.md | 16 --------- site/content/en/latest/design/eg-metrics.md | 35 ++++--------------- .../content/en/latest/install/install-helm.md | 8 ++--- 6 files changed, 12 insertions(+), 107 deletions(-) diff --git a/api/v1alpha1/envoygateway_helpers.go b/api/v1alpha1/envoygateway_helpers.go index cbb615b922f..e9e369f4e74 100644 --- a/api/v1alpha1/envoygateway_helpers.go +++ b/api/v1alpha1/envoygateway_helpers.go @@ -98,9 +98,7 @@ func (e *EnvoyGateway) GetEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { if e.Telemetry.Metrics.Prometheus == nil { e.Telemetry.Metrics.Prometheus = DefaultEnvoyGatewayPrometheus() } - if e.Telemetry.Metrics.Address == nil { - e.Telemetry.Metrics.Address = DefaultEnvoyGatewayMetricsAddress() - } + if e.Telemetry.Metrics == nil { e.Telemetry.Metrics = DefaultEnvoyGatewayMetrics() } @@ -111,16 +109,6 @@ func (e *EnvoyGateway) GetEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { return e.Telemetry } -// GetEnvoyGatewayMetricsAddress returns the EnvoyGateway Metrics Address. 
-func (e *EnvoyGateway) GetEnvoyGatewayMetricsAddress() string { - address := e.GetEnvoyGatewayTelemetry().Metrics.Address - if address != nil { - return fmt.Sprintf("%s:%d", address.Host, address.Port) - } - - return "" -} - // IfDisablePrometheus returns if disable prometheus. func (e *EnvoyGateway) IfDisablePrometheus() bool { return e.GetEnvoyGatewayTelemetry().Metrics.Prometheus.Disable @@ -136,19 +124,10 @@ func DefaultEnvoyGatewayTelemetry() *EnvoyGatewayTelemetry { // DefaultEnvoyGatewayMetrics returns a new EnvoyGatewayMetrics with default configuration parameters. func DefaultEnvoyGatewayMetrics() *EnvoyGatewayMetrics { return &EnvoyGatewayMetrics{ - Address: DefaultEnvoyGatewayMetricsAddress(), Prometheus: DefaultEnvoyGatewayPrometheus(), } } -// DefaultEnvoyGatewayMetricsAddress returns a new EnvoyGatewayMetrics with default configuration parameters. -func DefaultEnvoyGatewayMetricsAddress() *EnvoyGatewayMetricsAddress { - return &EnvoyGatewayMetricsAddress{ - Host: GatewayMetricsHost, - Port: GatewayMetricsPort, - } -} - // DefaultEnvoyGatewayPrometheus returns a new EnvoyGatewayMetrics with default configuration parameters. func DefaultEnvoyGatewayPrometheus() *EnvoyGatewayPrometheusProvider { return &EnvoyGatewayPrometheusProvider{ diff --git a/api/v1alpha1/envoygateway_metrics_types.go b/api/v1alpha1/envoygateway_metrics_types.go index 96fa3f4c587..0b827a06afa 100644 --- a/api/v1alpha1/envoygateway_metrics_types.go +++ b/api/v1alpha1/envoygateway_metrics_types.go @@ -7,8 +7,6 @@ package v1alpha1 // EnvoyGatewayMetrics defines control plane push/pull metrics configurations. type EnvoyGatewayMetrics struct { - // Address defines the address of Envoy Gateway Metrics Server. - Address *EnvoyGatewayMetricsAddress // Sinks defines the metric sinks where metrics are sent to. Sinks []EnvoyGatewayMetricSink `json:"sinks,omitempty"` // Prometheus defines the configuration for prometheus endpoint. 
@@ -48,18 +46,3 @@ type EnvoyGatewayPrometheusProvider struct { // Disable bool `json:"disable,omitempty"` } - -// EnvoyGatewayMetricsAddress defines the Envoy Gateway Metrics Address configuration. -type EnvoyGatewayMetricsAddress struct { - // Port defines the port the metrics server is exposed on. - // - // +optional - // +kubebuilder:validation:Minimum=0 - // +kubebuilder:default=19001 - Port int `json:"port,omitempty"` - // Host defines the metrics server hostname. - // - // +optional - // +kubebuilder:default="0.0.0.0" - Host string `json:"host,omitempty"` -} diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index b5677f62c35..be820081ebf 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -568,11 +568,6 @@ func (in *EnvoyGatewayMetricSink) DeepCopy() *EnvoyGatewayMetricSink { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyGatewayMetrics) DeepCopyInto(out *EnvoyGatewayMetrics) { *out = *in - if in.Address != nil { - in, out := &in.Address, &out.Address - *out = new(EnvoyGatewayMetricsAddress) - **out = **in - } if in.Sinks != nil { in, out := &in.Sinks, &out.Sinks *out = make([]EnvoyGatewayMetricSink, len(*in)) @@ -597,21 +592,6 @@ func (in *EnvoyGatewayMetrics) DeepCopy() *EnvoyGatewayMetrics { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *EnvoyGatewayMetricsAddress) DeepCopyInto(out *EnvoyGatewayMetricsAddress) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EnvoyGatewayMetricsAddress. 
-func (in *EnvoyGatewayMetricsAddress) DeepCopy() *EnvoyGatewayMetricsAddress { - if in == nil { - return nil - } - out := new(EnvoyGatewayMetricsAddress) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EnvoyGatewayOpenTelemetrySink) DeepCopyInto(out *EnvoyGatewayOpenTelemetrySink) { *out = *in diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index c77542b768b..d718cd49290 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -415,26 +415,10 @@ _Appears in:_ | Field | Description | | --- | --- | -| `Address` _[EnvoyGatewayMetricsAddress](#envoygatewaymetricsaddress)_ | Address defines the address of Envoy Gateway Metrics Server. | | `sinks` _[EnvoyGatewayMetricSink](#envoygatewaymetricsink) array_ | Sinks defines the metric sinks where metrics are sent to. | | `prometheus` _[EnvoyGatewayPrometheusProvider](#envoygatewayprometheusprovider)_ | Prometheus defines the configuration for prometheus endpoint. | -#### EnvoyGatewayMetricsAddress - - - -EnvoyGatewayMetricsAddress defines the Envoy Gateway Metrics Address configuration. - -_Appears in:_ -- [EnvoyGatewayMetrics](#envoygatewaymetrics) - -| Field | Description | -| --- | --- | -| `port` _integer_ | Port defines the port the metrics server is exposed on. | -| `host` _string_ | Host defines the metrics server hostname. 
| - - #### EnvoyGatewayOpenTelemetrySink diff --git a/site/content/en/latest/design/eg-metrics.md b/site/content/en/latest/design/eg-metrics.md index a16c63156d8..c74cd98c06e 100644 --- a/site/content/en/latest/design/eg-metrics.md +++ b/site/content/en/latest/design/eg-metrics.md @@ -113,7 +113,7 @@ New APIs will be added to Envoy Gateway config, which are used to manage Control ### EnvoyGatewayTelemetry -```go +``` go // EnvoyGatewayTelemetry defines telemetry configurations for envoy gateway control plane. // Control plane will focus on metrics observability telemetry and tracing telemetry later. type EnvoyGatewayTelemetry struct { @@ -124,11 +124,11 @@ type EnvoyGatewayTelemetry struct { ### EnvoyGatewayMetrics -```go +> Prometheus metrics will be exposed on 0.0.0.0:19001; this address is not yet configurable. + +``` go // EnvoyGatewayMetrics defines control plane push/pull metrics configurations. type EnvoyGatewayMetrics struct { - // Address defines the address of Envoy Gateway Metrics Server. - Address *EnvoyGatewayMetricsAddress // Sinks defines the metric sinks where metrics are sent to. Sinks []EnvoyGatewayMetricSink `json:"sinks,omitempty"` // Prometheus defines the configuration for prometheus endpoint. @@ -169,28 +169,13 @@ type EnvoyGatewayPrometheusProvider struct { Disable bool `json:"disable,omitempty"` } -// EnvoyGatewayMetricsAddress defines the Envoy Gateway Metrics Address configuration. -type EnvoyGatewayMetricsAddress struct { - // Port defines the port the metrics server is exposed on. - // - // +optional - // +kubebuilder:validation:Minimum=0 - // +kubebuilder:default=19001 - Port int `json:"port,omitempty"` - // Host defines the metrics server hostname. - // - // +optional - // +kubebuilder:default="0.0.0.0" - Host string `json:"host,omitempty"` -} - ``` #### Example + The following is an example to enable prometheus metric.
-```yaml +``` yaml apiVersion: gateway.envoyproxy.io/v1alpha1 kind: EnvoyGateway gateway: @@ -202,16 +187,13 @@ provider: type: Kubernetes telemetry: metrics: - address: - host: 0.0.0.0 - port: 19001 prometheus: disable: false ``` + The following is an example to send metric via Open Telemetry sink to OTEL gRPC Collector. -```yaml +``` yaml apiVersion: gateway.envoyproxy.io/v1alpha1 kind: EnvoyGateway gateway: @@ -233,7 +215,7 @@ telemetry: + The following is an example to enable prometheus metric and send metric via Open Telemetry sink to OTEL HTTP Collector at the same time. -```yaml +``` yaml apiVersion: gateway.envoyproxy.io/v1alpha1 kind: EnvoyGateway gateway: @@ -245,9 +227,6 @@ provider: type: Kubernetes telemetry: metrics: - address: - host: 0.0.0.0 - port: 19001 prometheus: disable: false sinks: diff --git a/site/content/en/latest/install/install-helm.md b/site/content/en/latest/install/install-helm.md index 41a122bb3ff..3f3c57e1db9 100644 --- a/site/content/en/latest/install/install-helm.md +++ b/site/content/en/latest/install/install-helm.md @@ -126,11 +126,11 @@ These are the ports used by Envoy Gateway and the managed Envoy Proxy. 
| Envoy Gateway | Address | Port | Configurable | |:----------------------:|:---------:|:------:| :------: | -| Xds EnvoyProxy Server | 0.0.0.0 | 18000 | No | -| Xds RateLimit Server | 0.0.0.0 | 18001 | No | +| Xds EnvoyProxy Server | 0.0.0.0 | 18000 | No | +| Xds RateLimit Server | 0.0.0.0 | 18001 | No | | Admin Server | 127.0.0.1 | 19000 | Yes | -| Metrics Server | 0.0.0.0 | 19001 | Yes | -| Health Check | 127.0.0.1 | 8081 | No | +| Metrics Server | 0.0.0.0 | 19001 | No | +| Health Check | 127.0.0.1 | 8081 | No | ### EnvoyProxy From 77ea45ff40e5166ba7146e77e01b2dd711b64fca Mon Sep 17 00:00:00 2001 From: bitliu Date: Tue, 24 Oct 2023 15:10:17 +0800 Subject: [PATCH 3/4] update Signed-off-by: bitliu --- site/content/en/latest/design/eg-metrics.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/site/content/en/latest/design/eg-metrics.md b/site/content/en/latest/design/eg-metrics.md index c74cd98c06e..e3879405b4f 100644 --- a/site/content/en/latest/design/eg-metrics.md +++ b/site/content/en/latest/design/eg-metrics.md @@ -1,8 +1,6 @@ --- date: 2023-10-10 title: "Control Plane Observability: Metrics" -author: Xunzhuo Liu -linkTitle: "Control Plane Observability: Metrics" --- {{% alert title="State" color="warning" %}} @@ -16,15 +14,13 @@ linkTitle: "Control Plane Observability: Metrics" This document aims to cover all aspects of envoy gateway control plane metrics observability. {{% alert title="Note" color="secondary" %}} -**Data plane** observability (while important) is outside of scope for this document. +**Data plane** observability (while important) is outside of scope for this document. For dataplane observability, refer to [here](./metrics). {{% /alert %}} ## Current State At present, the Envoy Gateway control plane provides logs and controller-runtime metrics, without traces. Logs are managed through our proprietary library (`internal/logging`, a shim to `zap`) and are written to `/dev/stdout`. 
-The absence of comprehensive and robust control plane metrics observability hinders the effective monitoring of Envoy Gateway in a production environment, a critical requirement before deploying Envoy Gateway into production. - ## Goals Our objectives include: @@ -37,7 +33,6 @@ Our objectives include: Our non-goals include: + Supporting other stats sinks. -+ Only focusing on code design and does not provide specific code implementation. ## Use-Cases @@ -50,7 +45,7 @@ The use-cases include: ### Standards -Our metrics, and traces in the future, will be built upon the [OpenTelemetry](https://opentelemetry.io/) standards. All metrics will be configured via the [OpenTelemetry SDK](https://opentelemetry.io/docs/specs/otel/metrics/sdk/), which offers neutral libraries that can be connected to various backends. +Our metrics will be built upon the [OpenTelemetry](https://opentelemetry.io/) standards. All metrics will be configured via the [OpenTelemetry SDK](https://opentelemetry.io/docs/specs/otel/metrics/sdk/), which offers neutral libraries that can be connected to various backends. This approach allows the Envoy Gateway code to concentrate on the crucial aspect - generating the metrics - and delegate all other tasks to systems designed for telemetry ingestion. @@ -173,7 +168,7 @@ type EnvoyGatewayPrometheusProvider struct { #### Example -+ The following is an example to enable prometheus metric. ++ The following is an example to disable prometheus metric. ``` yaml apiVersion: gateway.envoyproxy.io/v1alpha1 @@ -188,7 +183,7 @@ provider: telemetry: metrics: prometheus: - disable: false + disable: true ``` + The following is an example to send metric via Open Telemetry sink to OTEL gRPC Collector. @@ -213,7 +208,7 @@ telemetry: protocol: grpc ``` -+ The following is an example to enable prometheus metric and send metric via Open Telemetry sink to OTEL HTTP Collector at the same time.
++ The following is an example to disable prometheus metrics and send metrics via Open Telemetry sink to OTEL HTTP Collector at the same time. ``` yaml apiVersion: gateway.envoyproxy.io/v1alpha1 From 57a2ffa2fc9ea374f63371a469070215e2bf61c3 Mon Sep 17 00:00:00 2001 From: bitliu Date: Tue, 24 Oct 2023 15:38:55 +0800 Subject: [PATCH 4/4] update Signed-off-by: bitliu --- site/content/en/latest/design/eg-metrics.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/site/content/en/latest/design/eg-metrics.md b/site/content/en/latest/design/eg-metrics.md index e3879405b4f..f43af77be12 100644 --- a/site/content/en/latest/design/eg-metrics.md +++ b/site/content/en/latest/design/eg-metrics.md @@ -3,14 +3,6 @@ date: 2023-10-10 title: "Control Plane Observability: Metrics" --- -{{% alert title="State" color="warning" %}} - -+ Author: [Xunzhuo Liu](https://github.com/Xunzhuo) -+ Affiliation: Tencent -+ Data: 2023-10-12 -+ Status: Done -{{% /alert %}} - This document aims to cover all aspects of envoy gateway control plane metrics observability. {{% alert title="Note" color="secondary" %}} @@ -26,7 +18,7 @@ At present, the Envoy Gateway control plane provides logs and controller-runtime Our objectives include: + Supporting **PULL** mode for Prometheus metrics and exposing these metrics on the admin address. -+ Supporting **PUSH** mode for Prometheus metrics, thereby sending metrics to the Open Telemetry Stats sink. ++ Supporting **PUSH** mode for Prometheus metrics, thereby sending metrics to the Open Telemetry Stats sink via gRPC or HTTP. ## Non-Goals