From aa4e3a0f5add9d6ccdd4b8973cc3da085f84ffff Mon Sep 17 00:00:00 2001 From: Shyunn Date: Thu, 11 Apr 2024 11:18:33 +0800 Subject: [PATCH] feat: add trace for rate-limit (#2974) * feat: support trace of ratelimit Signed-off-by: ShyunnY <1147212064@qq.com> * fix: add json tag Signed-off-by: ShyunnY <1147212064@qq.com> * fix: use OTEL_EXPORTER_OTLP_ENDPOINT env Signed-off-by: ShyunnY <1147212064@qq.com> * fix Signed-off-by: ShyunnY <1147212064@qq.com> * docs: update docs Signed-off-by: yuluo-yx --------- Signed-off-by: ShyunnY <1147212064@qq.com> Signed-off-by: yuluo-yx Co-authored-by: yuluo-yx Co-authored-by: zirain --- api/v1alpha1/envoygateway_types.go | 31 ++++ api/v1alpha1/zz_generated.deepcopy.go | 50 ++++++ .../kubernetes/ratelimit/resource.go | 96 ++++++++++- .../kubernetes/ratelimit/resource_provider.go | 3 +- .../ratelimit/resource_provider_test.go | 40 +++++ .../kubernetes/ratelimit/resource_test.go | 40 +++++ .../deployments/enable-tracing-custom.yaml | 160 ++++++++++++++++++ .../testdata/deployments/enable-tracing.yaml | 160 ++++++++++++++++++ site/content/en/latest/api/extension_types.md | 33 ++++ .../observability/rate-limit-observability.md | 70 ++++++++ 10 files changed, 679 insertions(+), 4 deletions(-) create mode 100644 internal/infrastructure/kubernetes/ratelimit/resource_test.go create mode 100644 internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing-custom.yaml create mode 100644 internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing.yaml create mode 100644 site/content/en/latest/tasks/observability/rate-limit-observability.md diff --git a/api/v1alpha1/envoygateway_types.go b/api/v1alpha1/envoygateway_types.go index 47b9861e170..ade9e056b1e 100644 --- a/api/v1alpha1/envoygateway_types.go +++ b/api/v1alpha1/envoygateway_types.go @@ -354,6 +354,9 @@ type RateLimit struct { type RateLimitTelemetry struct { // Metrics defines metrics configuration for RateLimit. 
Metrics *RateLimitMetrics `json:"metrics,omitempty"` + + // Tracing defines traces configuration for RateLimit. + Tracing *RateLimitTracing `json:"tracing,omitempty"` } type RateLimitMetrics struct { @@ -366,6 +369,34 @@ type RateLimitMetricsPrometheusProvider struct { Disable bool `json:"disable,omitempty"` } +type RateLimitTracing struct { + // SamplingRate controls the rate at which traffic will be + // selected for tracing if no prior sampling decision has been made. + // Defaults to 100, valid values [0-100]. 100 indicates 100% sampling. + // +optional + SamplingRate *uint32 `json:"samplingRate,omitempty"` + + // Provider defines the rateLimit tracing provider. + // Only OpenTelemetry is supported currently. + Provider *RateLimitTracingProvider `json:"provider,omitempty"` +} + +type RateLimitTracingProviderType string + +const ( + RateLimitTracingProviderTypeOpenTelemetry TracingProviderType = "OpenTelemetry" +) + +// RateLimitTracingProvider defines the tracing provider configuration of RateLimit +type RateLimitTracingProvider struct { + // Type defines the tracing provider type. + // Since to RateLimit Exporter currently using OpenTelemetry, only OpenTelemetry is supported + Type *RateLimitTracingProviderType `json:"type,omitempty"` + + // URL is the endpoint of the trace collector that supports the OTLP protocol + URL string `json:"url"` +} + // RateLimitDatabaseBackend defines the configuration associated with // the database backend used by the rate limit service. 
// +union diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index e82cda7787f..b1e849077bd 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -3298,6 +3298,11 @@ func (in *RateLimitTelemetry) DeepCopyInto(out *RateLimitTelemetry) { *out = new(RateLimitMetrics) (*in).DeepCopyInto(*out) } + if in.Tracing != nil { + in, out := &in.Tracing, &out.Tracing + *out = new(RateLimitTracing) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RateLimitTelemetry. @@ -3310,6 +3315,51 @@ func (in *RateLimitTelemetry) DeepCopy() *RateLimitTelemetry { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RateLimitTracing) DeepCopyInto(out *RateLimitTracing) { + *out = *in + if in.SamplingRate != nil { + in, out := &in.SamplingRate, &out.SamplingRate + *out = new(uint32) + **out = **in + } + if in.Provider != nil { + in, out := &in.Provider, &out.Provider + *out = new(RateLimitTracingProvider) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RateLimitTracing. +func (in *RateLimitTracing) DeepCopy() *RateLimitTracing { + if in == nil { + return nil + } + out := new(RateLimitTracing) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RateLimitTracingProvider) DeepCopyInto(out *RateLimitTracingProvider) { + *out = *in + if in.Type != nil { + in, out := &in.Type, &out.Type + *out = new(RateLimitTracingProviderType) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RateLimitTracingProvider. 
+func (in *RateLimitTracingProvider) DeepCopy() *RateLimitTracingProvider { + if in == nil { + return nil + } + out := new(RateLimitTracingProvider) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RateLimitValue) DeepCopyInto(out *RateLimitValue) { *out = *in diff --git a/internal/infrastructure/kubernetes/ratelimit/resource.go b/internal/infrastructure/kubernetes/ratelimit/resource.go index 32dfba94c1b..7e7a9d3722d 100644 --- a/internal/infrastructure/kubernetes/ratelimit/resource.go +++ b/internal/infrastructure/kubernetes/ratelimit/resource.go @@ -79,6 +79,18 @@ const ( ConfigGrpcXdsServerURLEnvVar = "CONFIG_GRPC_XDS_SERVER_URL" // ConfigGrpcXdsNodeIDEnvVar is the id of ratelimit node. ConfigGrpcXdsNodeIDEnvVar = "CONFIG_GRPC_XDS_NODE_ID" + // TracingEnabledVar is enabled the tracing feature + TracingEnabledVar = "TRACING_ENABLED" + // TracingServiceNameVar is service name appears in tracing span + TracingServiceNameVar = "TRACING_SERVICE_NAME" + // TracingServiceNamespaceVar is service namespace appears in tracing span + TracingServiceNamespaceVar = "TRACING_SERVICE_NAMESPACE" + // TracingServiceInstanceIDVar is service instance id appears in tracing span + TracingServiceInstanceIDVar = "TRACING_SERVICE_INSTANCE_ID" + // TracingSamplingRateVar is trace sampling rate + TracingSamplingRateVar = "TRACING_SAMPLING_RATE" + // OTELExporterOTLPTraceEndpointVar is target url to which the trace exporter is going to send + OTELExporterOTLPTraceEndpointVar = "OTEL_EXPORTER_OTLP_ENDPOINT" // InfraName is the name for rate-limit resources. InfraName = "envoy-ratelimit" @@ -125,7 +137,8 @@ func rateLimitLabels() map[string]string { } // expectedRateLimitContainers returns expected rateLimit containers. 
-func expectedRateLimitContainers(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec) []corev1.Container { +func expectedRateLimitContainers(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec, + namespace string) []corev1.Container { ports := []corev1.ContainerPort{ { Name: "grpc", @@ -142,7 +155,7 @@ func expectedRateLimitContainers(rateLimit *egv1a1.RateLimit, rateLimitDeploymen Command: []string{ "/bin/ratelimit", }, - Env: expectedRateLimitContainerEnv(rateLimit, rateLimitDeployment), + Env: expectedRateLimitContainerEnv(rateLimit, rateLimitDeployment, namespace), Ports: ports, Resources: *rateLimitDeployment.Container.Resources, SecurityContext: rateLimitDeployment.Container.SecurityContext, @@ -275,7 +288,8 @@ func expectedDeploymentVolumes(rateLimit *egv1a1.RateLimit, rateLimitDeployment } // expectedRateLimitContainerEnv returns expected rateLimit container envs. -func expectedRateLimitContainerEnv(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec) []corev1.EnvVar { +func expectedRateLimitContainerEnv(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec, + namespace string) []corev1.EnvVar { env := []corev1.EnvVar{ { Name: RuntimeRootEnvVar, @@ -384,6 +398,54 @@ func expectedRateLimitContainerEnv(rateLimit *egv1a1.RateLimit, rateLimitDeploym } } + if enableTracing(rateLimit) { + var sampleRate = 1.0 + if rateLimit.Telemetry.Tracing.SamplingRate != nil { + sampleRate = float64(*rateLimit.Telemetry.Tracing.SamplingRate) / 100.0 + } + + traceEndpoint := checkTraceEndpointScheme(rateLimit.Telemetry.Tracing.Provider.URL) + tracingEnvs := []corev1.EnvVar{ + { + Name: TracingEnabledVar, + Value: "true", + }, + { + Name: TracingServiceNameVar, + Value: InfraName, + }, + { + Name: TracingServiceNamespaceVar, + Value: namespace, + }, + { + // By default, this is a random instanceID, + // we use the RateLimit pod name as the trace service 
instanceID. + Name: TracingServiceInstanceIDVar, + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + APIVersion: "v1", + FieldPath: "metadata.name", + }, + }, + }, + { + Name: TracingSamplingRateVar, + // The api is configured with [0,100], but sampling can only be [0,1]. + // doc: https://github.com/envoyproxy/ratelimit?tab=readme-ov-file#tracing + // You will lose precision during the conversion process, but don't worry, + // this follows the rounding rule and won't make the expected sampling rate too different + // from the actual sampling rate + Value: strconv.FormatFloat(sampleRate, 'f', 1, 64), + }, + { + Name: OTELExporterOTLPTraceEndpointVar, + Value: traceEndpoint, + }, + } + env = append(env, tracingEnvs...) + } + return resource.ExpectedContainerEnv(rateLimitDeployment.Container, env) } @@ -399,3 +461,31 @@ func Validate(ctx context.Context, client client.Client, gateway *egv1a1.EnvoyGa return nil } + +func enableTracing(rl *egv1a1.RateLimit) bool { + // Other fields can use the default values, + // but we have to make sure the user has the Provider.URL + if rl != nil && rl.Telemetry != nil && + rl.Telemetry.Tracing != nil && + rl.Telemetry.Tracing.Provider != nil && + len(rl.Telemetry.Tracing.Provider.URL) != 0 { + return true + } + + return false +} + +// checkTraceEndpointScheme Check the scheme prefix in the trace url +func checkTraceEndpointScheme(url string) string { + // Since the OTLP collector needs to configure the scheme prefix, + // we need to check if the user has configured this + // TODO: It is currently assumed to be a normal connection, + // and a TLS connection will be added later. 
+ httpScheme := "http://" + exist := strings.HasPrefix(url, httpScheme) + if exist { + return url + } + + return fmt.Sprintf("%s%s", httpScheme, url) +} diff --git a/internal/infrastructure/kubernetes/ratelimit/resource_provider.go b/internal/infrastructure/kubernetes/ratelimit/resource_provider.go index 90f646d014f..885cb4ddca6 100644 --- a/internal/infrastructure/kubernetes/ratelimit/resource_provider.go +++ b/internal/infrastructure/kubernetes/ratelimit/resource_provider.go @@ -61,6 +61,7 @@ func (r *ResourceRender) Name() string { func enablePrometheus(rl *egv1a1.RateLimit) bool { if rl != nil && rl.Telemetry != nil && + rl.Telemetry.Metrics != nil && rl.Telemetry.Metrics.Prometheus != nil { return !rl.Telemetry.Metrics.Prometheus.Disable } @@ -183,7 +184,7 @@ func (r *ResourceRender) ServiceAccount() (*corev1.ServiceAccount, error) { // Deployment returns the expected rate limit Deployment based on the provided infra. func (r *ResourceRender) Deployment() (*appsv1.Deployment, error) { - containers := expectedRateLimitContainers(r.rateLimit, r.rateLimitDeployment) + containers := expectedRateLimitContainers(r.rateLimit, r.rateLimitDeployment, r.Namespace) labels := rateLimitLabels() selector := resource.GetSelector(labels) diff --git a/internal/infrastructure/kubernetes/ratelimit/resource_provider_test.go b/internal/infrastructure/kubernetes/ratelimit/resource_provider_test.go index 6c56631d9cc..52aec1fabed 100644 --- a/internal/infrastructure/kubernetes/ratelimit/resource_provider_test.go +++ b/internal/infrastructure/kubernetes/ratelimit/resource_provider_test.go @@ -648,6 +648,46 @@ func TestDeployment(t *testing.T) { }, }, }, + { + caseName: "enable-tracing", + rateLimit: &egv1a1.RateLimit{ + Backend: egv1a1.RateLimitDatabaseBackend{ + Type: egv1a1.RedisBackendType, + Redis: &egv1a1.RateLimitRedisSettings{ + URL: "redis.redis.svc:6379", + }, + }, + Telemetry: &egv1a1.RateLimitTelemetry{ + Tracing: &egv1a1.RateLimitTracing{ + Provider: 
&egv1a1.RateLimitTracingProvider{ + URL: "http://trace-collector.envoy-gateway-system.svc.cluster.local:4318", + }, + }, + }, + }, + }, + { + caseName: "enable-tracing-custom", + rateLimit: &egv1a1.RateLimit{ + Backend: egv1a1.RateLimitDatabaseBackend{ + Type: egv1a1.RedisBackendType, + Redis: &egv1a1.RateLimitRedisSettings{ + URL: "redis.redis.svc:6379", + }, + }, + Telemetry: &egv1a1.RateLimitTelemetry{ + Tracing: &egv1a1.RateLimitTracing{ + SamplingRate: func() *uint32 { + var samplingRate uint32 = 55 + return &samplingRate + }(), + Provider: &egv1a1.RateLimitTracingProvider{ + URL: "trace-collector.envoy-gateway-system.svc.cluster.local:4317", + }, + }, + }, + }, + }, } for _, tc := range cases { t.Run(tc.caseName, func(t *testing.T) { diff --git a/internal/infrastructure/kubernetes/ratelimit/resource_test.go b/internal/infrastructure/kubernetes/ratelimit/resource_test.go new file mode 100644 index 00000000000..71179c8c7c4 --- /dev/null +++ b/internal/infrastructure/kubernetes/ratelimit/resource_test.go @@ -0,0 +1,40 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. 
+ +package ratelimit + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestCheckTraceEndpointScheme(t *testing.T) { + + cases := []struct { + caseName string + actualURL string + expectedURL string + }{ + { + caseName: "normal url with http prefix", + actualURL: "http://collector.observability.svc.cluster.local:4318", + expectedURL: "http://collector.observability.svc.cluster.local:4318", + }, + { + caseName: "abnormal url without http prefix", + actualURL: "collector.observability.svc.cluster.local:4318", + expectedURL: "http://collector.observability.svc.cluster.local:4318", + }, + } + + for _, tc := range cases { + t.Run(tc.caseName, func(t *testing.T) { + actual := checkTraceEndpointScheme(tc.actualURL) + require.Equal(t, tc.expectedURL, actual) + }) + } + +} diff --git a/internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing-custom.yaml b/internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing-custom.yaml new file mode 100644 index 00000000000..b4c7d9472e9 --- /dev/null +++ b/internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing-custom.yaml @@ -0,0 +1,160 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: ratelimit + app.kubernetes.io/managed-by: envoy-gateway + app.kubernetes.io/name: envoy-ratelimit + name: envoy-ratelimit + namespace: envoy-gateway-system + ownerReferences: + - apiVersion: apps/v1 + kind: Deployment + name: envoy-gateway + uid: test-owner-reference-uid-for-deployment +spec: + progressDeadlineSeconds: 600 + revisionHistoryLimit: 10 + selector: + matchLabels: + app.kubernetes.io/component: ratelimit + app.kubernetes.io/managed-by: envoy-gateway + app.kubernetes.io/name: envoy-ratelimit + strategy: + type: RollingUpdate + template: + metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: "19001" + prometheus.io/scrape: "true" + creationTimestamp: 
null + labels: + app.kubernetes.io/component: ratelimit + app.kubernetes.io/managed-by: envoy-gateway + app.kubernetes.io/name: envoy-ratelimit + spec: + automountServiceAccountToken: false + containers: + - command: + - /bin/ratelimit + env: + - name: RUNTIME_ROOT + value: /data + - name: RUNTIME_SUBDIRECTORY + value: ratelimit + - name: RUNTIME_IGNOREDOTFILES + value: "true" + - name: RUNTIME_WATCH_ROOT + value: "false" + - name: LOG_LEVEL + value: info + - name: USE_STATSD + value: "false" + - name: CONFIG_TYPE + value: GRPC_XDS_SOTW + - name: CONFIG_GRPC_XDS_SERVER_URL + value: envoy-gateway:18001 + - name: CONFIG_GRPC_XDS_NODE_ID + value: envoy-ratelimit + - name: GRPC_SERVER_USE_TLS + value: "true" + - name: GRPC_SERVER_TLS_CERT + value: /certs/tls.crt + - name: GRPC_SERVER_TLS_KEY + value: /certs/tls.key + - name: GRPC_SERVER_TLS_CA_CERT + value: /certs/ca.crt + - name: CONFIG_GRPC_XDS_SERVER_USE_TLS + value: "true" + - name: CONFIG_GRPC_XDS_CLIENT_TLS_CERT + value: /certs/tls.crt + - name: CONFIG_GRPC_XDS_CLIENT_TLS_KEY + value: /certs/tls.key + - name: CONFIG_GRPC_XDS_SERVER_TLS_CACERT + value: /certs/ca.crt + - name: FORCE_START_WITHOUT_INITIAL_CONFIG + value: "true" + - name: REDIS_SOCKET_TYPE + value: tcp + - name: REDIS_URL + value: redis.redis.svc:6379 + - name: TRACING_ENABLED + value: "true" + - name: TRACING_SERVICE_NAME + value: envoy-ratelimit + - name: TRACING_SERVICE_NAMESPACE + value: envoy-gateway-system + - name: TRACING_SERVICE_INSTANCE_ID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: TRACING_SAMPLING_RATE + value: "0.6" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: http://trace-collector.envoy-gateway-system.svc.cluster.local:4317 + image: envoyproxy/ratelimit:master + imagePullPolicy: IfNotPresent + name: envoy-ratelimit + ports: + - containerPort: 8081 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthcheck + port: 8080 + scheme: HTTP + periodSeconds: 10 + 
successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: 100m + memory: 512Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /certs + name: certs + readOnly: true + - command: + - /bin/statsd_exporter + - --web.listen-address=:19001 + - --statsd.mapping-config=/etc/statsd-exporter/conf.yaml + image: prom/statsd-exporter:v0.18.0 + imagePullPolicy: IfNotPresent + name: prom-statsd-exporter + ports: + - containerPort: 9125 + name: statsd + protocol: TCP + - containerPort: 19001 + name: metrics + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /etc/statsd-exporter + name: statsd-exporter-config + readOnly: true + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + serviceAccountName: envoy-ratelimit + terminationGracePeriodSeconds: 300 + volumes: + - name: certs + secret: + defaultMode: 420 + secretName: envoy-rate-limit + - configMap: + defaultMode: 420 + name: statsd-exporter-config + optional: true + name: statsd-exporter-config +status: {} diff --git a/internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing.yaml b/internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing.yaml new file mode 100644 index 00000000000..e36ff5ef87d --- /dev/null +++ b/internal/infrastructure/kubernetes/ratelimit/testdata/deployments/enable-tracing.yaml @@ -0,0 +1,160 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + creationTimestamp: null + labels: + app.kubernetes.io/component: ratelimit + app.kubernetes.io/managed-by: envoy-gateway + app.kubernetes.io/name: envoy-ratelimit + name: envoy-ratelimit + namespace: envoy-gateway-system + ownerReferences: + - apiVersion: apps/v1 + kind: Deployment + name: envoy-gateway + uid: test-owner-reference-uid-for-deployment +spec: + progressDeadlineSeconds: 600 + revisionHistoryLimit: 10 + 
selector: + matchLabels: + app.kubernetes.io/component: ratelimit + app.kubernetes.io/managed-by: envoy-gateway + app.kubernetes.io/name: envoy-ratelimit + strategy: + type: RollingUpdate + template: + metadata: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: "19001" + prometheus.io/scrape: "true" + creationTimestamp: null + labels: + app.kubernetes.io/component: ratelimit + app.kubernetes.io/managed-by: envoy-gateway + app.kubernetes.io/name: envoy-ratelimit + spec: + automountServiceAccountToken: false + containers: + - command: + - /bin/ratelimit + env: + - name: RUNTIME_ROOT + value: /data + - name: RUNTIME_SUBDIRECTORY + value: ratelimit + - name: RUNTIME_IGNOREDOTFILES + value: "true" + - name: RUNTIME_WATCH_ROOT + value: "false" + - name: LOG_LEVEL + value: info + - name: USE_STATSD + value: "false" + - name: CONFIG_TYPE + value: GRPC_XDS_SOTW + - name: CONFIG_GRPC_XDS_SERVER_URL + value: envoy-gateway:18001 + - name: CONFIG_GRPC_XDS_NODE_ID + value: envoy-ratelimit + - name: GRPC_SERVER_USE_TLS + value: "true" + - name: GRPC_SERVER_TLS_CERT + value: /certs/tls.crt + - name: GRPC_SERVER_TLS_KEY + value: /certs/tls.key + - name: GRPC_SERVER_TLS_CA_CERT + value: /certs/ca.crt + - name: CONFIG_GRPC_XDS_SERVER_USE_TLS + value: "true" + - name: CONFIG_GRPC_XDS_CLIENT_TLS_CERT + value: /certs/tls.crt + - name: CONFIG_GRPC_XDS_CLIENT_TLS_KEY + value: /certs/tls.key + - name: CONFIG_GRPC_XDS_SERVER_TLS_CACERT + value: /certs/ca.crt + - name: FORCE_START_WITHOUT_INITIAL_CONFIG + value: "true" + - name: REDIS_SOCKET_TYPE + value: tcp + - name: REDIS_URL + value: redis.redis.svc:6379 + - name: TRACING_ENABLED + value: "true" + - name: TRACING_SERVICE_NAME + value: envoy-ratelimit + - name: TRACING_SERVICE_NAMESPACE + value: envoy-gateway-system + - name: TRACING_SERVICE_INSTANCE_ID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: TRACING_SAMPLING_RATE + value: "1.0" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: 
http://trace-collector.envoy-gateway-system.svc.cluster.local:4318 + image: envoyproxy/ratelimit:master + imagePullPolicy: IfNotPresent + name: envoy-ratelimit + ports: + - containerPort: 8081 + name: grpc + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthcheck + port: 8080 + scheme: HTTP + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: 100m + memory: 512Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /certs + name: certs + readOnly: true + - command: + - /bin/statsd_exporter + - --web.listen-address=:19001 + - --statsd.mapping-config=/etc/statsd-exporter/conf.yaml + image: prom/statsd-exporter:v0.18.0 + imagePullPolicy: IfNotPresent + name: prom-statsd-exporter + ports: + - containerPort: 9125 + name: statsd + protocol: TCP + - containerPort: 19001 + name: metrics + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /etc/statsd-exporter + name: statsd-exporter-config + readOnly: true + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + serviceAccountName: envoy-ratelimit + terminationGracePeriodSeconds: 300 + volumes: + - name: certs + secret: + defaultMode: 420 + secretName: envoy-rate-limit + - configMap: + defaultMode: 420 + name: statsd-exporter-config + optional: true + name: statsd-exporter-config +status: {} diff --git a/site/content/en/latest/api/extension_types.md b/site/content/en/latest/api/extension_types.md index c3316f67026..7234b805d8f 100644 --- a/site/content/en/latest/api/extension_types.md +++ b/site/content/en/latest/api/extension_types.md @@ -2354,6 +2354,39 @@ _Appears in:_ | Field | Type | Required | Description | | --- | --- | --- | --- | | `metrics` | _[RateLimitMetrics](#ratelimitmetrics)_ | true | Metrics defines metrics configuration for RateLimit. 
| +| `tracing` | _[RateLimitTracing](#ratelimittracing)_ | true | Tracing defines traces configuration for RateLimit. | + + +#### RateLimitTracing + + + + + +_Appears in:_ +- [RateLimitTelemetry](#ratelimittelemetry) + +| Field | Type | Required | Description | +| --- | --- | --- | --- | +| `samplingRate` | _integer_ | false | SamplingRate controls the rate at which traffic will be
selected for tracing if no prior sampling decision has been made.
Defaults to 100, valid values [0-100]. 100 indicates 100% sampling. | +| `provider` | _[RateLimitTracingProvider](#ratelimittracingprovider)_ | true | Provider defines the rateLimit tracing provider.
Only OpenTelemetry is supported currently. | + + +#### RateLimitTracingProvider + + + +RateLimitTracingProvider defines the tracing provider configuration of RateLimit + +_Appears in:_ +- [RateLimitTracing](#ratelimittracing) + +| Field | Type | Required | Description | +| --- | --- | --- | --- | +| `type` | _[RateLimitTracingProviderType](#ratelimittracingprovidertype)_ | true | Type defines the tracing provider type.
Since to RateLimit Exporter currently using OpenTelemetry, only OpenTelemetry is supported | +| `url` | _string_ | true | URL is the endpoint of the trace collector that supports the OTLP protocol | + + #### RateLimitType diff --git a/site/content/en/latest/tasks/observability/rate-limit-observability.md b/site/content/en/latest/tasks/observability/rate-limit-observability.md new file mode 100644 index 00000000000..350be4dc4b1 --- /dev/null +++ b/site/content/en/latest/tasks/observability/rate-limit-observability.md @@ -0,0 +1,70 @@ +--- +title: "RateLimit Observability" +--- + +Envoy Gateway provides observability for the RateLimit instances. +This guide shows you how to configure RateLimit observability, including traces. + +## Prerequisites + +Follow the steps from the [Quickstart Guide](../quickstart) to install Envoy Gateway and the HTTPRoute example manifest. +Before proceeding, you should be able to query the example backend using HTTP. Follow the steps from the [Global Rate Limit](../traffic/global-rate-limit) to install RateLimit. + +[OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) offers a vendor-agnostic implementation of how to receive, process and export telemetry data. + +Install OTel-Collector: + +```shell +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +helm repo update +helm upgrade --install otel-collector open-telemetry/opentelemetry-collector -f https://raw.githubusercontent.com/envoyproxy/gateway/latest/examples/otel-collector/helm-values.yaml -n monitoring --create-namespace --version 0.60.0 +``` + +## Traces + +By default, the Envoy Gateway does not configure RateLimit to send traces to the OpenTelemetry Sink. +You can configure the collector in the `rateLimit.telemetry.tracing` field of the `EnvoyGateway` CRD. + +RateLimit uses the OpenTelemetry Exporter to export traces to the collector.
+You can configure a collector that supports the OTLP protocol, which includes but is not limited to: OpenTelemetry Collector, Jaeger, Zipkin, and so on. + +***Note:*** + +* By default, the Envoy Gateway configures a `100%` sampling rate for RateLimit, which may lead to performance issues. + +Assuming the OpenTelemetry Collector is running in the `observability` namespace, and it has a service named `otel-svc`, +we only want to sample `50%` of the trace data. We would configure it as follows: + +```shell +cat <