fix: fix test

Signed-off-by: ShyunnY <1147212064@qq.com> fix: update refactor: rl trace Signed-off-by: ShyunnY <1147212064@qq.com>
envoyproxy · Mar 28, 2024 · e3babaa · e3babaa
1 parent 323b432
commit e3babaa
Show file tree

Hide file tree

Showing 12 changed files with 706 additions and 443 deletions.
diff --git a/api/v1alpha1/envoygateway_types.go b/api/v1alpha1/envoygateway_types.go
@@ -355,39 +355,26 @@ type RateLimitMetricsPrometheusProvider struct {
 }
 
 type RateLimitTracing struct {
-	// TracingServiceName defines the service name appears in tracing span.
-	// The default value is "envoy-ratelimit"
-	TracingServiceName string `json:"tracingServiceName,omitempty"`
-
-	// TracingServiceNamespace defines the service namespace appears in tracing span.
-	// The default value is namespace where the RateLimit resides
-	TracingServiceNamespace string `json:"tracingServiceNamespace,omitempty"`
-
-	// TracingSampleRate defines the sampling rate, defaults to 1.0 which means always sample.
-	// Valid range: [0.0,1.0] For high volume services, adjusting the sampling rate is recommended.
-	TracingSampleRate *float64 `json:"tracingSampleRate,omitempty"`
+	// SampleRate controls the rate at which traffic will be
+	// selected for tracing if no prior sampling decision has been made.
+	// Defaults to 100, valid values [0-100]. 100 indicates 100% sampling.
+	// +optional
+	SampleRate *uint32 `json:"tracingSampleRate,omitempty"`
 
-	// Provider defines the tracing provider.
-	Provider *RateLimitTraceProvider `json:"provider,omitempty"`
-}
+	// BackendRef defines the target trace collector endpoint configuration.
+	BackendRef gwapiv1.BackendObjectReference `json:"backendRef"`
 
-type RateLimitTraceProvider struct {
 	// Protocol defines the protocol of provider in tracing feature.
 	// Only "http"(default) and "grpc" are allowed in this field
+	// +optional
 	Protocol string `json:"protocol,omitempty"`
 
-	// Endpoint defines target URL to which the provider is going to send traces.
-	// The endpoint must be a valid URL with scheme (http or https) and host, may contain a port,
-	// should contain a path and must not contain other parts (such as query string or fragment).
-	Endpoint string `json:"endpoint,omitempty"`
-
-	// Insecure Whether to enable client transport security for the provider gRPC connection.
-	// Default is true
-	Insecure *bool `json:"insecure,omitempty"`
-
-	// Timeout Maximum time the provider will wait for each batch export.
-	// The time format follows the Go time package, such as "300ms", "-1.5h" or "2h45m".
-	Timeout string `json:"timeout,omitempty"`
+	// ClusterDomain is an optional field that specifies the custom domain used for the Kubernetes cluster.
+	// This field is used when the cluster is configured with a custom DNS domain,
+	// different from the default "cluster.local".
+	// Envoy Gateway uses this custom domain to generate fully qualified domain names (FQDN) for trace gatherer services.
+	// +optional
+	ClusterDomain string `json:"clusterDomain,omitempty"`
 }
 
 // RateLimitDatabaseBackend defines the configuration associated with

diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
diff --git a/internal/infrastructure/kubernetes/ratelimit/resource.go b/internal/infrastructure/kubernetes/ratelimit/resource.go
@@ -16,6 +16,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	gwapiv1 "sigs.k8s.io/gateway-api/apis/v1"
 
 	egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1"
 	"github.com/envoyproxy/gateway/internal/infrastructure/kubernetes/resource"
@@ -93,12 +94,10 @@ const (
 	TracingSamplingRateVar = "TRACING_SAMPLING_RATE"
 	// OTELExporterOTLPEndpointVar is target url to which the exporter is going to send
 	OTELExporterOTLPEndpointVar = "OTEL_EXPORTER_OTLP_ENDPOINT"
-	// OTELExporterOTLPInsecure is enable client transport security for the exporter's gRPC connection.
-	OTELExporterOTLPInsecure = "OTEL_EXPORTER_OTLP_INSECURE"
-	// OTELExporterOTLPTimeoutVar Maximum time the OTLP exporter will wait for each batch export.
-	OTELExporterOTLPTimeoutVar = "OTEL_EXPORTER_OTLP_TIMEOUT"
 	// InfraName is the name for rate-limit resources.
 	InfraName = "envoy-ratelimit"
+	// InfraNamespace is the namespace for rate-limit resources.
+	InfraNamespace = "envoy-gateway-system"
 	// InfraGRPCPort is the grpc port that the rate limit service listens on.
 	InfraGRPCPort = 8081
 	// XdsGrpcSotwConfigServerPort is the listening port of the ratelimit xDS config server.
@@ -142,7 +141,7 @@ func rateLimitLabels() map[string]string {
 }
 
 // expectedRateLimitContainers returns expected rateLimit containers.
-func expectedRateLimitContainers(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec) []corev1.Container {
+func expectedRateLimitContainers(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec, namespace string) []corev1.Container {
 	ports := []corev1.ContainerPort{
 		{
 			Name:          "grpc",
@@ -159,7 +158,7 @@ func expectedRateLimitContainers(rateLimit *egv1a1.RateLimit, rateLimitDeploymen
 			Command: []string{
 				"/bin/ratelimit",
 			},
-			Env:                      expectedRateLimitContainerEnv(rateLimit, rateLimitDeployment),
+			Env:                      expectedRateLimitContainerEnv(rateLimit, rateLimitDeployment, namespace),
 			Ports:                    ports,
 			Resources:                *rateLimitDeployment.Container.Resources,
 			SecurityContext:          rateLimitDeployment.Container.SecurityContext,
@@ -290,7 +289,7 @@ func expectedDeploymentVolumes(rateLimit *egv1a1.RateLimit, rateLimitDeployment
 }
 
 // expectedRateLimitContainerEnv returns expected rateLimit container envs.
-func expectedRateLimitContainerEnv(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec) []corev1.EnvVar {
+func expectedRateLimitContainerEnv(rateLimit *egv1a1.RateLimit, rateLimitDeployment *egv1a1.KubernetesDeploymentSpec, namespace string) []corev1.EnvVar {
 	env := []corev1.EnvVar{
 		{
 			Name:  RedisSocketTypeEnvVar,
@@ -395,38 +394,40 @@ func expectedRateLimitContainerEnv(rateLimit *egv1a1.RateLimit, rateLimitDeploym
 	}
 
 	if enableTracing(rateLimit) {
-
-		if len(rateLimit.Telemetry.Tracing.TracingServiceName) == 0 {
-			rateLimit.Telemetry.Tracing.TracingServiceName = InfraName
+		var protocol = "http"
+		if len(rateLimit.Telemetry.Tracing.Protocol) != 0 {
+			protocol = rateLimit.Telemetry.Tracing.Protocol
 		}
 
 		var sampleRate = 1.0
-		if rateLimit.Telemetry.Tracing.TracingSampleRate != nil {
-			sampleRate = *rateLimit.Telemetry.Tracing.TracingSampleRate
+		if rateLimit.Telemetry.Tracing.SampleRate != nil {
+			sampleRate = float64(*rateLimit.Telemetry.Tracing.SampleRate) / 100.0
 		}
 
-		var insecure = true
-		if rateLimit.Telemetry.Tracing.Provider.Insecure != nil {
-			insecure = *rateLimit.Telemetry.Tracing.Provider.Insecure
-		}
-		if len(rateLimit.Telemetry.Tracing.Provider.Protocol) == 0 {
-			rateLimit.Telemetry.Tracing.Provider.Protocol = "http"
+		// If no namespace is provided, we assume that they are in the same namespace.
+		if rateLimit.Telemetry.Tracing.BackendRef.Namespace == nil {
+			ns := gwapiv1.Namespace(namespace)
+			rateLimit.Telemetry.Tracing.BackendRef.Namespace = &ns
 		}
 
+		targetEndpoint := buildTraceEndpoint(rateLimit)
+
 		tracingEnvs := []corev1.EnvVar{
 			{
 				Name:  TracingEnabledVar,
 				Value: "true",
 			},
 			{
 				Name:  TracingServiceNameVar,
-				Value: rateLimit.Telemetry.Tracing.TracingServiceName,
+				Value: InfraName,
 			},
 			{
 				Name:  TracingServiceNamespaceVar,
-				Value: rateLimit.Telemetry.Tracing.TracingServiceNamespace,
+				Value: namespace,
 			},
 			{
+				// By default, this is a random instanceID,
+				// we use the RateLimit pod name as the trace service instanceID.
 				Name: TracingServiceInstanceIDVar,
 				ValueFrom: &corev1.EnvVarSource{
 					FieldRef: &corev1.ObjectFieldSelector{
@@ -436,32 +437,23 @@ func expectedRateLimitContainerEnv(rateLimit *egv1a1.RateLimit, rateLimitDeploym
 				},
 			},
 			{
-				Name:  TracingSamplingRateVar,
+				Name: TracingSamplingRateVar,
+				// The API takes an integer percentage in [0,100], while ratelimit expects a fraction in [0,1].
+				// doc: https://github.com/envoyproxy/ratelimit?tab=readme-ov-file#tracing
+				// Two decimal places represent every integer percentage exactly; a single
+				// decimal place would round 95 to 0.9 and 1-4 to 0.0, which would
+				// silently disable sampling for small percentages.
-				Value: strconv.FormatFloat(sampleRate, 'f', 1, 64),
+				Value: strconv.FormatFloat(sampleRate, 'f', 2, 64),
 			},
 			{
 				Name:  TracingExporterProtocolVar,
-				Value: rateLimit.Telemetry.Tracing.Provider.Protocol,
+				Value: protocol,
 			},
 			{
 				Name:  OTELExporterOTLPEndpointVar,
-				Value: rateLimit.Telemetry.Tracing.Provider.Endpoint,
-			},
-			{
-				Name:  OTELExporterOTLPInsecure,
-				Value: strconv.FormatBool(insecure),
+				Value: targetEndpoint,
 			},
 		}
-
-		if len(rateLimit.Telemetry.Tracing.Provider.Timeout) != 0 {
-			tracingEnvs = append(tracingEnvs, []corev1.EnvVar{
-				{
-					Name:  OTELExporterOTLPTimeoutVar,
-					Value: rateLimit.Telemetry.Tracing.Provider.Timeout,
-				},
-			}...)
-		}
-
 		env = append(env, tracingEnvs...)
 	}
 
@@ -480,14 +472,35 @@ func Validate(ctx context.Context, client client.Client, gateway *egv1a1.EnvoyGa
 }
 
 func enableTracing(rl *egv1a1.RateLimit) bool {
-
-	// Other fields can use the default values, but we have to make sure the user has the endpoint configured
+	// Tracing is enabled only when a tracing config with a non-empty BackendRef.Name
+	// is present; every other tracing field falls back to a default value.
 	if rl != nil && rl.Telemetry != nil &&
 		rl.Telemetry.Tracing != nil &&
-		rl.Telemetry.Tracing.Provider != nil &&
-		len(rl.Telemetry.Tracing.Provider.Endpoint) != 0 {
+		len(rl.Telemetry.Tracing.BackendRef.Name) != 0 {
 		return true
 	}
 
 	return false
 }
+
+// buildTraceEndpoint returns the trace collector address in the form
+// "<name>.<namespace>.svc.<clusterDomain>:<port>". BackendRef.Namespace must be non-nil.
+func buildTraceEndpoint(rateLimit *egv1a1.RateLimit) string {
+	tracing := rateLimit.Telemetry.Tracing
+
+	// By default, the cluster domain is "cluster.local",
+	// but a custom cluster domain overrides it when configured.
+	clusterDomain := "cluster.local"
+	if len(tracing.ClusterDomain) != 0 {
+		clusterDomain = tracing.ClusterDomain
+	}
+
+	// Without an explicit port, use the OTLP default of the protocol: 4317 for gRPC, 4318 for HTTP.
+	port := 4318
+	if tracing.BackendRef.Port != nil {
+		port = int(*tracing.BackendRef.Port)
+	} else if tracing.Protocol == "grpc" {
+		port = 4317
+	}
+	return fmt.Sprintf("%s.%s.svc.%s:%d", tracing.BackendRef.Name, string(*tracing.BackendRef.Namespace), clusterDomain, port)
+}
diff --git a/internal/infrastructure/kubernetes/ratelimit/resource_provider.go b/internal/infrastructure/kubernetes/ratelimit/resource_provider.go
@@ -185,11 +185,7 @@ func (r *ResourceRender) ServiceAccount() (*corev1.ServiceAccount, error) {
 // Deployment returns the expected rate limit Deployment based on the provided infra.
 func (r *ResourceRender) Deployment() (*appsv1.Deployment, error) {
 
-	if enableTracing(r.rateLimit) && len(r.rateLimit.Telemetry.Tracing.TracingServiceNamespace) == 0 {
-		r.rateLimit.Telemetry.Tracing.TracingServiceNamespace = r.Namespace
-	}
-
-	containers := expectedRateLimitContainers(r.rateLimit, r.rateLimitDeployment)
+	containers := expectedRateLimitContainers(r.rateLimit, r.rateLimitDeployment, r.Namespace)
 	labels := rateLimitLabels()
 	selector := resource.GetSelector(labels)
 

diff --git a/internal/infrastructure/kubernetes/ratelimit/resource_provider_test.go b/internal/infrastructure/kubernetes/ratelimit/resource_provider_test.go
@@ -649,7 +649,7 @@ func TestDeployment(t *testing.T) {
 			},
 		},
 		{
-			caseName: "enable-tracing-feature-with-default",
+			caseName: "enable-tracing-with-default",
 			rateLimit: &egv1a1.RateLimit{
 				Backend: egv1a1.RateLimitDatabaseBackend{
 					Type: egv1a1.RedisBackendType,
@@ -659,15 +659,15 @@ func TestDeployment(t *testing.T) {
 				},
 				Telemetry: &egv1a1.RateLimitTelemetry{
 					Tracing: &egv1a1.RateLimitTracing{
-						Provider: &egv1a1.RateLimitTraceProvider{
-							Endpoint: "http://localhost:4318/v1/traces",
+						BackendRef: gwapiv1.BackendObjectReference{
+							Name: "trace-collector",
 						},
 					},
 				},
 			},
 		},
 		{
-			caseName: "enable-tracing-feature-with-custom",
+			caseName: "enable-tracing-with-custom",
 			rateLimit: &egv1a1.RateLimit{
 				Backend: egv1a1.RateLimitDatabaseBackend{
 					Type: egv1a1.RedisBackendType,
@@ -677,21 +677,49 @@ func TestDeployment(t *testing.T) {
 				},
 				Telemetry: &egv1a1.RateLimitTelemetry{
 					Tracing: &egv1a1.RateLimitTracing{
-						TracingServiceName:      "prod-rate-limit",
-						TracingServiceNamespace: "prod",
-						TracingSampleRate: func() *float64 {
-							sampleRate := 0.9
-							return &sampleRate
+						SampleRate: func() *uint32 {
+							var rate uint32 = 95
+							return &rate
 						}(),
-						Provider: &egv1a1.RateLimitTraceProvider{
-							Endpoint: "http://localhost:4317",
-							Protocol: "grpc",
-							Insecure: func() *bool {
-								insecure := false
-								return &insecure
+						BackendRef: gwapiv1.BackendObjectReference{
+							Name: "trace-collector",
+							Namespace: func() *gwapiv1.Namespace {
+								var ns gwapiv1.Namespace = "observability"
+								return &ns
+							}(),
+							Port: func() *gwapiv1.PortNumber {
+								var port gwapiv1.PortNumber = 4317
+								return &port
+							}(),
+						},
+						Protocol: "grpc",
+					},
+				},
+			},
+		},
+		{
+			caseName: "enable-tracing-with-custom-domain",
+			rateLimit: &egv1a1.RateLimit{
+				Backend: egv1a1.RateLimitDatabaseBackend{
+					Type: egv1a1.RedisBackendType,
+					Redis: &egv1a1.RateLimitRedisSettings{
+						URL: "redis.redis.svc:6379",
+					},
+				},
+				Telemetry: &egv1a1.RateLimitTelemetry{
+					Tracing: &egv1a1.RateLimitTracing{
+						SampleRate: func() *uint32 {
+							var rate uint32 = 55
+							return &rate
+						}(),
+						BackendRef: gwapiv1.BackendObjectReference{
+							Name: "trace-collector",
+							Namespace: func() *gwapiv1.Namespace {
+								var ns gwapiv1.Namespace = "observability"
+								return &ns
 							}(),
-							Timeout: "10s",
 						},
+						ClusterDomain: "example.local",
 					},
 				},
 			},
@@ -710,7 +738,7 @@ func TestDeployment(t *testing.T) {
 			dp, err := r.Deployment()
 			require.NoError(t, err)
 
 			if *overrideTestData {
 				deploymentYAML, err := yaml.Marshal(dp)
 				require.NoError(t, err)
 				// nolint:gosec