api: RequestCost configurations (#103)
This adds the LLMRequestCosts field to AIGatewayRoute,
which allows users to do rate limiting etc.
based on the calculated "token usage".

This is based on the new feature introduced in 
* envoyproxy/gateway#4957
* envoyproxy/gateway#5035

and because of that feature, the only thing we have to do
on the AI Gateway side is to set dynamic metadata as described in
the comment in the API.
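
For illustration, an AIGatewayRoute using the new field might look like the
following minimal sketch. The llmRequestCosts entries mirror the example in
the API comment below; the apiVersion, metadata, and the omission of the other
spec fields are assumptions for illustration only, not part of this change:

    apiVersion: aigateway.envoyproxy.io/v1alpha1
    kind: AIGatewayRoute
    metadata:
      name: usage-rate-limit
      namespace: default
    spec:
      llmRequestCosts:
      - metadataKey: llm_input_token
        type: InputToken
      - metadataKey: llm_output_token
        type: OutputToken
      - metadataKey: llm_total_token
        type: TotalToken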

---------

Signed-off-by: Takeshi Yoneda <t.y.mathetake@gmail.com>
mathetake authored Jan 18, 2025
1 parent 1840371 commit f4ba5cc
Showing 23 changed files with 447 additions and 85 deletions.
135 changes: 135 additions & 0 deletions api/v1alpha1/api.go
@@ -80,6 +80,99 @@ type AIGatewayRouteSpec struct {
// Currently, the filter is only implemented as an external process filter, which might be
// extended to other types of filters in the future. See https://github.com/envoyproxy/ai-gateway/issues/90
FilterConfig *AIGatewayFilterConfig `json:"filterConfig,omitempty"`

// LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
// The AI Gateway filter will capture each specified number and store it in Envoy's dynamic
// metadata per HTTP request. The metadata namespace is "io.envoy.ai_gateway".
//
// For example, let's say we have the following LLMRequestCosts configuration:
//
// llmRequestCosts:
// - metadataKey: llm_input_token
//   type: InputToken
// - metadataKey: llm_output_token
//   type: OutputToken
// - metadataKey: llm_total_token
//   type: TotalToken
//
// Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
// rate limit buckets for each unique x-user-id header value: one for the input tokens,
// one for the output tokens, and one for the total tokens.
// Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
//
// apiVersion: gateway.envoyproxy.io/v1alpha1
// kind: BackendTrafficPolicy
// metadata:
//   name: some-example-token-rate-limit
//   namespace: default
// spec:
//   targetRefs:
//   - group: gateway.networking.k8s.io
//     kind: HTTPRoute
//     name: usage-rate-limit
//   rateLimit:
//     type: Global
//     global:
//       rules:
//       - clientSelectors:
//         # Do the rate limiting based on the x-user-id header.
//         - headers:
//           - name: x-user-id
//             type: Distinct
//         limit:
//           # Configures the number of "tokens" allowed per hour.
//           requests: 10000
//           unit: Hour
//         cost:
//           request:
//             from: Number
//             # Setting the request cost to zero allows checking the rate limit budget
//             # without consuming any of it on the request path.
//             number: 0
//           # This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
//           # The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
//           # if the budget is exhausted.
//           response:
//             from: Metadata
//             metadata:
//               namespace: io.envoy.ai_gateway
//               key: llm_input_token
//       - clientSelectors:
//         - headers:
//           - name: x-user-id
//             type: Distinct
//         limit:
//           requests: 10000
//           unit: Hour
//         cost:
//           request:
//             from: Number
//             number: 0
//           response:
//             from: Metadata
//             metadata:
//               namespace: io.envoy.ai_gateway
//               key: llm_output_token
//       - clientSelectors:
//         - headers:
//           - name: x-user-id
//             type: Distinct
//         limit:
//           requests: 10000
//           unit: Hour
//         cost:
//           request:
//             from: Number
//             number: 0
//           response:
//             from: Metadata
//             metadata:
//               namespace: io.envoy.ai_gateway
//               key: llm_total_token
//
// +optional
// +kubebuilder:validation:MaxItems=36
LLMRequestCosts []LLMRequestCost `json:"llmRequestCosts,omitempty"`
}

// AIGatewayRouteRule is a rule that defines the routing behavior of the AIGatewayRoute.
@@ -230,6 +323,9 @@ type AIServiceBackendSpec struct {
//
// +optional
BackendSecurityPolicyRef *gwapiv1.LocalObjectReference `json:"backendSecurityPolicyRef,omitempty"`

// TODO: maybe add a backend-level LLMRequestCost configuration that overrides the AIGatewayRoute-level LLMRequestCost.
// That may be useful for backends that have different cost calculation logic.
}

// VersionedAPISchema defines the API schema of either AIGatewayRoute (the input) or AIServiceBackend (the output).
@@ -378,3 +474,42 @@ type AWSOIDCExchangeToken struct {
// which maps to the temporary AWS security credentials exchanged using the authentication token issued by OIDC provider.
AwsRoleArn string `json:"awsRoleArn"`
}

// LLMRequestCost configures each request cost.
type LLMRequestCost struct {
// MetadataKey is the key of the metadata entry under which this request cost is stored.
//
// +kubebuilder:validation:Required
MetadataKey string `json:"metadataKey"`
// Type specifies the type of the request cost. The default is "OutputToken",
// which uses the output token count as the cost. The other types are "InputToken" and "TotalToken".
//
// +kubebuilder:validation:Enum=OutputToken;InputToken;TotalToken
Type LLMRequestCostType `json:"type"`
// CELExpression is the CEL expression used to calculate the cost of the request.
// The expression must return an integer value and should be able to access the
// request headers, model name, backend name, input/output tokens, etc.
//
// +optional
// +notImplementedHide https://github.com/envoyproxy/ai-gateway/issues/97
CELExpression *string `json:"celExpression"`
}

// LLMRequestCostType specifies the type of the LLMRequestCost.
type LLMRequestCostType string

const (
// LLMRequestCostTypeInputToken is the cost type of the input token.
LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken"
// LLMRequestCostTypeOutputToken is the cost type of the output token.
LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken"
// LLMRequestCostTypeTotalToken is the cost type of the total token.
LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken"
// LLMRequestCostTypeCEL is for calculating the cost using the CEL expression.
LLMRequestCostTypeCEL LLMRequestCostType = "CEL"
)

const (
// AIGatewayFilterMetadataNamespace is the namespace for the ai-gateway filter metadata.
AIGatewayFilterMetadataNamespace = "io.envoy.ai_gateway"
)
27 changes: 27 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

44 changes: 32 additions & 12 deletions filterconfig/filterconfig.go
@@ -33,7 +33,7 @@ modelNameHeaderKey: x-envoy-ai-gateway-model
//   name: OpenAI
// selectedBackendHeaderKey: x-envoy-ai-gateway-selected-backend
// modelNameHeaderKey: x-envoy-ai-gateway-model
// tokenUsageMetadata:
// llmRequestCost:
//   namespace: ai_gateway_llm_ns
//   key: token_usage_key
// rules:
@@ -66,11 +66,12 @@ modelNameHeaderKey: x-envoy-ai-gateway-model
// From Envoy configuration perspective, configuring the header matching based on `x-envoy-ai-gateway-selected-backend` is enough to route the request to the selected backend.
// That is because the matching decision is made by the filter and the selected backend is populated in the header `x-envoy-ai-gateway-selected-backend`.
type Config struct {
// TokenUsageMetadata is the namespace and key to be used in the filter metadata to store the usage token, optional.
// If this is provided, the filter will populate the usage token in the filter metadata at the end of the
// response body processing.
TokenUsageMetadata *TokenUsageMetadata `yaml:"tokenUsageMetadata,omitempty"`
// Schema specifies the API schema of the input format of requests to the filter.
// MetadataNamespace is the namespace of the dynamic metadata to be used by the filter.
MetadataNamespace string `yaml:"metadataNamespace"`
// LLMRequestCosts configures the cost of each LLM-related request. Optional. If provided, the filter will populate
// the calculated cost in the filter metadata at the end of the response body processing.
LLMRequestCosts []LLMRequestCost `yaml:"llmRequestCosts,omitempty"`
// InputSchema specifies the API schema of the input format of requests to the filter.
Schema VersionedAPISchema `yaml:"schema"`
// ModelNameHeaderKey is the header key to be populated with the model name by the filter.
ModelNameHeaderKey string `yaml:"modelNameHeaderKey"`
@@ -82,18 +83,37 @@ type Config struct {
Rules []RouteRule `yaml:"rules"`
}

// TokenUsageMetadata is the namespace and key to be used in the filter metadata to store the usage token.
// LLMRequestCost specifies "where" the request cost is stored in the filter metadata as well as
// "how" the cost is calculated. By default, the cost is retrieved from the "output token" in the response body.
//
// This can be used to subtract the token usage from the rate limit budget when the request completes,
// combined with the `apply_on_stream_done` and `hits_addend` fields of the rate limit configuration
// https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#config-route-v3-ratelimit
// introduced in Envoy 1.33 (to be released soon as of this writing).
type TokenUsageMetadata struct {
// Namespace is the namespace of the metadata.
Namespace string `yaml:"namespace"`
// Key is the key of the metadata.
Key string `yaml:"key"`
type LLMRequestCost struct {
// MetadataKey is the key of the metadata storing the request cost.
MetadataKey string `yaml:"metadataKey"`
// Type is the kind of the request cost calculation.
Type LLMRequestCostType `yaml:"type"`
// CELExpression is the CEL expression to calculate the cost of the request.
// This must not be empty when Type is LLMRequestCostTypeCELExpression.
CELExpression string `yaml:"celExpression,omitempty"`
}

// LLMRequestCostType specifies the kind of the request cost calculation.
type LLMRequestCostType string

const (
// LLMRequestCostTypeOutputToken specifies that the request cost is calculated from the output token.
LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken"
// LLMRequestCostTypeInputToken specifies that the request cost is calculated from the input token.
LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken"
// LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token.
LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken"
// LLMRequestCostTypeCELExpression specifies that the request cost is calculated from the CEL expression.
LLMRequestCostTypeCELExpression LLMRequestCostType = "CEL"
)

// VersionedAPISchema corresponds to LLMAPISchema in api/v1alpha1/api.go.
type VersionedAPISchema struct {
// Name is the name of the API schema.
12 changes: 7 additions & 5 deletions filterconfig/filterconfig_test.go
@@ -33,9 +33,10 @@ schema:
  name: OpenAI
selectedBackendHeaderKey: x-envoy-ai-gateway-selected-backend
modelNameHeaderKey: x-envoy-ai-gateway-model
tokenUsageMetadata:
  namespace: ai_gateway_llm_ns
  key: token_usage_key
metadataNamespace: ai_gateway_llm_ns
llmRequestCosts:
- metadataKey: token_usage_key
  type: OutputToken
rules:
- backends:
  - name: kserve
@@ -60,8 +61,9 @@ rules:
require.NoError(t, os.WriteFile(configPath, []byte(config), 0o600))
cfg, err := filterconfig.UnmarshalConfigYaml(configPath)
require.NoError(t, err)
require.Equal(t, "ai_gateway_llm_ns", cfg.TokenUsageMetadata.Namespace)
require.Equal(t, "token_usage_key", cfg.TokenUsageMetadata.Key)
require.Equal(t, "ai_gateway_llm_ns", cfg.MetadataNamespace)
require.Equal(t, "token_usage_key", cfg.LLMRequestCosts[0].MetadataKey)
require.Equal(t, "OutputToken", string(cfg.LLMRequestCosts[0].Type))
require.Equal(t, "OpenAI", string(cfg.Schema.Name))
require.Equal(t, "x-envoy-ai-gateway-selected-backend", cfg.SelectedBackendHeaderKey)
require.Equal(t, "x-envoy-ai-gateway-model", cfg.ModelNameHeaderKey)
3 changes: 3 additions & 0 deletions internal/controller/ai_gateway_route.go
@@ -141,6 +141,9 @@ func (c *aiGatewayRouteController) reconcileExtProcExtensionPolicy(ctx context.C
Port: &port,
},
}}},
Metadata: &egv1a1.ExtProcMetadata{
WritableNamespaces: []string{aigv1a1.AIGatewayFilterMetadataNamespace},
},
}},
},
}
5 changes: 5 additions & 0 deletions internal/controller/ai_gateway_route_test.go
@@ -128,6 +128,11 @@ func TestAIGatewayRouteController_reconcileExtProcExtensionPolicy(t *testing.T)
for i, target := range extPolicy.Spec.TargetRefs {
require.Equal(t, aiGatewayRoute.Spec.TargetRefs[i].Name, target.Name)
}
require.Equal(t, ownerRef, extPolicy.OwnerReferences)
require.Len(t, extPolicy.Spec.ExtProc, 1)
require.NotNil(t, extPolicy.Spec.ExtProc[0].Metadata)
require.NotEmpty(t, extPolicy.Spec.ExtProc[0].Metadata.WritableNamespaces)
require.Equal(t, aigv1a1.AIGatewayFilterMetadataNamespace, extPolicy.Spec.ExtProc[0].Metadata.WritableNamespaces[0])

// Update the policy.
aiGatewayRoute.Spec.TargetRefs = []gwapiv1a2.LocalPolicyTargetReferenceWithSectionName{
18 changes: 18 additions & 0 deletions internal/controller/sink.go
@@ -191,6 +191,24 @@ func (c *configSink) updateExtProcConfigMap(aiGatewayRoute *aigv1a1.AIGatewayRou
}
}

ec.MetadataNamespace = aigv1a1.AIGatewayFilterMetadataNamespace
for _, cost := range aiGatewayRoute.Spec.LLMRequestCosts {
fc := filterconfig.LLMRequestCost{MetadataKey: cost.MetadataKey}
switch cost.Type {
case aigv1a1.LLMRequestCostTypeInputToken:
fc.Type = filterconfig.LLMRequestCostTypeInputToken
case aigv1a1.LLMRequestCostTypeOutputToken:
fc.Type = filterconfig.LLMRequestCostTypeOutputToken
case aigv1a1.LLMRequestCostTypeTotalToken:
fc.Type = filterconfig.LLMRequestCostTypeTotalToken
case aigv1a1.LLMRequestCostTypeCEL:
fc.Type = filterconfig.LLMRequestCostTypeCELExpression
default:
return fmt.Errorf("unknown request cost type: %s", cost.Type)
}
ec.LLMRequestCosts = append(ec.LLMRequestCosts, fc)
}

marshaled, err := yaml.Marshal(ec)
if err != nil {
return fmt.Errorf("failed to marshal extproc config: %w", err)
15 changes: 15 additions & 0 deletions internal/controller/sink_test.go
@@ -267,11 +267,22 @@ func Test_updateExtProcConfigMap(t *testing.T) {
},
},
},
LLMRequestCosts: []aigv1a1.LLMRequestCost{
{
Type: aigv1a1.LLMRequestCostTypeOutputToken,
MetadataKey: "output-token",
},
{
Type: aigv1a1.LLMRequestCostTypeInputToken,
MetadataKey: "input-token",
},
},
},
},
exp: &filterconfig.Config{
Schema: filterconfig.VersionedAPISchema{Name: filterconfig.APISchemaOpenAI, Version: "v123"},
ModelNameHeaderKey: aigv1a1.AIModelHeaderKey,
MetadataNamespace: aigv1a1.AIGatewayFilterMetadataNamespace,
SelectedBackendHeaderKey: selectedBackendHeaderKey,
Rules: []filterconfig.RouteRule{
{
@@ -285,6 +296,10 @@
Headers: []filterconfig.HeaderMatch{{Name: aigv1a1.AIModelHeaderKey, Value: "another-ai"}},
},
},
LLMRequestCosts: []filterconfig.LLMRequestCost{
{Type: filterconfig.LLMRequestCostTypeOutputToken, MetadataKey: "output-token"},
{Type: filterconfig.LLMRequestCostTypeInputToken, MetadataKey: "input-token"},
},
},
},
} {
4 changes: 2 additions & 2 deletions internal/extproc/mocks_test.go
@@ -70,7 +70,7 @@ type mockTranslator struct {
retHeaderMutation *extprocv3.HeaderMutation
retBodyMutation *extprocv3.BodyMutation
retOverride *extprocv3http.ProcessingMode
retUsedToken uint32
retUsedToken translator.LLMTokenUsage
retErr error
}

@@ -87,7 +87,7 @@ func (m mockTranslator) ResponseHeaders(headers map[string]string) (headerMutati
}

// ResponseBody implements [translator.Translator.ResponseBody].
func (m mockTranslator) ResponseBody(body io.Reader, _ bool) (headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, usedToken uint32, err error) {
func (m mockTranslator) ResponseBody(body io.Reader, _ bool) (headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, tokenUsage translator.LLMTokenUsage, err error) {
if m.expResponseBody != nil {
buf, err := io.ReadAll(body)
require.NoError(m.t, err)