Skip to content

Commit

Permalink
feat: Gracefully drain listeners before envoy shutdown on pod termina…
Browse files Browse the repository at this point in the history
…tion (#2633)

* feat: Gracefully drain listeners before envoy shutdown on pod termination

Signed-off-by: David Alger <davidmalger@gmail.com>

* Setup hooks to manage graceful Envoy shutdown process

Signed-off-by: David Alger <davidmalger@gmail.com>

* Implement graceful drain process in shutdown-manager

Signed-off-by: David Alger <davidmalger@gmail.com>

* Send logs from exec prestop hook to stdout of main process

Signed-off-by: David Alger <davidmalger@gmail.com>

* Make linter happy

Signed-off-by: David Alger <davidmalger@gmail.com>

* Minor cleanup

Signed-off-by: David Alger <davidmalger@gmail.com>

* Stop polling when ready-timeout exceeded

Signed-off-by: David Alger <davidmalger@gmail.com>

* Container configuration for shutdown manager

Signed-off-by: David Alger <davidmalger@gmail.com>

* Setup health probes

Signed-off-by: David Alger <davidmalger@gmail.com>

* Configurable shutdown timeouts

Signed-off-by: David Alger <davidmalger@gmail.com>

* Correct exitAtConnections logic

Signed-off-by: David Alger <davidmalger@gmail.com>

* Lower shutdown timeouts for conformance tests

Signed-off-by: David Alger <davidmalger@gmail.com>

* Integrate with latest from main

Signed-off-by: David Alger <davidmalger@gmail.com>

* Describe node used in test runs

Signed-off-by: David Alger <davidmalger@gmail.com>

* Use TAG=latest for conformance tests

Signed-off-by: David Alger <davidmalger@gmail.com>

* Update shutdown/ready logic and misc cleanup

Signed-off-by: David Alger <davidmalger@gmail.com>

* Shutdown manager config test

Signed-off-by: David Alger <davidmalger@gmail.com>

* Test coverage FileLogger

Signed-off-by: David Alger <davidmalger@gmail.com>

* Require use of patch field to override config on shutdown-manager container

Signed-off-by: David Alger <davidmalger@gmail.com>

* Update docs

Signed-off-by: David Alger <davidmalger@gmail.com>

* Remove knob for ExitAtConnections

Signed-off-by: David Alger <davidmalger@gmail.com>

* Pass image version for shutdown-manager in from build

Signed-off-by: David Alger <davidmalger@gmail.com>

* Fix generated content

Signed-off-by: David Alger <davidmalger@gmail.com>

* Recombine image consts

Signed-off-by: David Alger <davidmalger@gmail.com>

* Lower default min-drain-duration to 5 seconds

Signed-off-by: David Alger <davidmalger@gmail.com>

* Update generated docs

Signed-off-by: David Alger <davidmalger@gmail.com>

* Fail health checks to support fast failure when active health checking

Signed-off-by: David Alger <davidmalger@gmail.com>

* Update tests

Signed-off-by: David Alger <davidmalger@gmail.com>

* Move drain type config to correct place

Signed-off-by: David Alger <davidmalger@gmail.com>

---------

Signed-off-by: David Alger <davidmalger@gmail.com>
  • Loading branch information
davidalger authored Feb 27, 2024
1 parent 72fadb7 commit 329aafc
Show file tree
Hide file tree
Showing 123 changed files with 2,002 additions and 58 deletions.
11 changes: 11 additions & 0 deletions api/v1alpha1/envoyproxy_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (

autoscalingv2 "k8s.io/api/autoscaling/v2"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/utils/ptr"
)

Expand Down Expand Up @@ -120,3 +121,13 @@ func (logging *ProxyLogging) GetEnvoyProxyComponentLevel() string {

return strings.Join(args, ",")
}

// DefaultShutdownManagerContainerResourceRequirements builds a fresh
// ResourceRequirements carrying the shutdown manager's default CPU and memory
// requests. Only Requests is populated.
func DefaultShutdownManagerContainerResourceRequirements() *v1.ResourceRequirements {
	// Parse the package-level default quantities up front, CPU first.
	cpu := resource.MustParse(DefaultShutdownManagerCPUResourceRequests)
	memory := resource.MustParse(DefaultShutdownManagerMemoryResourceRequests)

	requirements := new(v1.ResourceRequirements)
	requirements.Requests = v1.ResourceList{
		v1.ResourceCPU:    cpu,
		v1.ResourceMemory: memory,
	}
	return requirements
}
19 changes: 19 additions & 0 deletions api/v1alpha1/envoyproxy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ type EnvoyProxySpec struct {
//
// +optional
MergeGateways *bool `json:"mergeGateways,omitempty"`

// Shutdown defines configuration for graceful envoy shutdown process.
//
// +optional
Shutdown *ShutdownConfig `json:"shutdown,omitempty"`
}

type ProxyTelemetry struct {
Expand Down Expand Up @@ -115,6 +120,20 @@ type EnvoyProxyProvider struct {
Kubernetes *EnvoyProxyKubernetesProvider `json:"kubernetes,omitempty"`
}

// ShutdownConfig defines configuration for graceful envoy shutdown process.
//
// Both fields are optional pointers that are omitted from JSON when nil; when
// a field is left unspecified, the default documented on that field applies.
type ShutdownConfig struct {
// DrainTimeout defines the graceful drain timeout. This should be less than the pod's terminationGracePeriodSeconds.
// If unspecified, defaults to 600 seconds.
//
// +optional
DrainTimeout *metav1.Duration `json:"drainTimeout,omitempty"`
// MinDrainDuration defines the minimum drain duration allowing time for endpoint deprogramming to complete.
// If unspecified, defaults to 5 seconds.
//
// +optional
MinDrainDuration *metav1.Duration `json:"minDrainDuration,omitempty"`
}

// EnvoyProxyKubernetesProvider defines configuration for the Kubernetes resource
// provider.
type EnvoyProxyKubernetesProvider struct {
Expand Down
6 changes: 6 additions & 0 deletions api/v1alpha1/shared_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ const (
DefaultDeploymentMemoryResourceRequests = "512Mi"
// DefaultEnvoyProxyImage is the default image used by envoyproxy
DefaultEnvoyProxyImage = "envoyproxy/envoy:distroless-dev"
// DefaultShutdownManagerCPUResourceRequests for shutdown manager cpu resource
DefaultShutdownManagerCPUResourceRequests = "10m"
// DefaultShutdownManagerMemoryResourceRequests for shutdown manager memory resource
DefaultShutdownManagerMemoryResourceRequests = "32Mi"
// DefaultShutdownManagerImage is the default image used for the shutdown manager.
DefaultShutdownManagerImage = "envoyproxy/gateway-dev:latest"
// DefaultRateLimitImage is the default image used by ratelimit.
DefaultRateLimitImage = "envoyproxy/ratelimit:master"
// HTTPProtocol is the commonly used HTTP protocol.
Expand Down
30 changes: 30 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -6631,6 +6631,21 @@ spec:
required:
- type
type: object
shutdown:
description: Shutdown defines configuration for graceful envoy shutdown
process.
properties:
drainTimeout:
description: DrainTimeout defines the graceful drain timeout.
This should be less than the pod's terminationGracePeriodSeconds.
If unspecified, defaults to 600 seconds.
type: string
minDrainDuration:
description: MinDrainDuration defines the minimum drain duration
allowing time for endpoint deprogramming to complete. If unspecified,
defaults to 5 seconds.
type: string
type: object
telemetry:
description: Telemetry defines telemetry parameters for managed proxies.
properties:
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ require (
go.opentelemetry.io/proto/otlp v1.1.0
go.uber.org/zap v1.26.0
golang.org/x/exp v0.0.0-20231006140011-7918f672742d
golang.org/x/sys v0.17.0
google.golang.org/grpc v1.61.1
google.golang.org/protobuf v1.32.0
gopkg.in/yaml.v3 v3.0.1
Expand Down Expand Up @@ -109,7 +110,6 @@ require (
golang.org/x/net v0.20.0 // indirect
golang.org/x/oauth2 v0.16.0 // indirect
golang.org/x/sync v0.6.0 // indirect
golang.org/x/sys v0.17.0 // indirect
golang.org/x/term v0.16.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: default/eg/http
perConnectionBufferLimitBytes: 32768
- activeState:
Expand Down Expand Up @@ -881,6 +882,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: default/eg/grpc
perConnectionBufferLimitBytes: 32768
- activeState:
Expand All @@ -903,6 +905,7 @@ xds:
socketAddress:
address: 0.0.0.0
portValue: 8443
drainType: MODIFY_ONLY
filterChains:
- filterChainMatch:
serverNames:
Expand Down Expand Up @@ -948,6 +951,7 @@ xds:
socketAddress:
address: 0.0.0.0
portValue: 1234
drainType: MODIFY_ONLY
filterChains:
- filters:
- name: envoy.filters.network.tcp_proxy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: default/eg/http
perConnectionBufferLimitBytes: 32768
- '@type': type.googleapis.com/envoy.admin.v3.RoutesConfigDump
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@
}
]
},
"drainType": "MODIFY_ONLY",
"name": "default/eg/http",
"perConnectionBufferLimitBytes": 32768
}
Expand Down Expand Up @@ -679,6 +680,7 @@
}
]
},
"drainType": "MODIFY_ONLY",
"name": "default/eg/grpc",
"perConnectionBufferLimitBytes": 32768
}
Expand Down Expand Up @@ -715,6 +717,7 @@
"portValue": 8443
}
},
"drainType": "MODIFY_ONLY",
"filterChains": [
{
"filterChainMatch": {
Expand Down Expand Up @@ -792,6 +795,7 @@
"portValue": 1234
}
},
"drainType": "MODIFY_ONLY",
"filterChains": [
{
"filters": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: default/eg/http
perConnectionBufferLimitBytes: 32768
- activeState:
Expand Down Expand Up @@ -407,6 +408,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: default/eg/grpc
perConnectionBufferLimitBytes: 32768
- activeState:
Expand All @@ -429,6 +431,7 @@ xds:
socketAddress:
address: 0.0.0.0
portValue: 8443
drainType: MODIFY_ONLY
filterChains:
- filterChainMatch:
serverNames:
Expand Down Expand Up @@ -474,6 +477,7 @@ xds:
socketAddress:
address: 0.0.0.0
portValue: 1234
drainType: MODIFY_ONLY
filterChains:
- filters:
- name: envoy.filters.network.tcp_proxy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: default/eg/http
perConnectionBufferLimitBytes: 32768
- activeState:
Expand Down Expand Up @@ -124,6 +125,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: default/eg/grpc
perConnectionBufferLimitBytes: 32768
- activeState:
Expand All @@ -146,6 +148,7 @@ xds:
socketAddress:
address: 0.0.0.0
portValue: 8443
drainType: MODIFY_ONLY
filterChains:
- filterChainMatch:
serverNames:
Expand Down Expand Up @@ -191,6 +194,7 @@ xds:
socketAddress:
address: 0.0.0.0
portValue: 1234
drainType: MODIFY_ONLY
filterChains:
- filters:
- name: envoy.filters.network.tcp_proxy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@
}
]
},
"drainType": "MODIFY_ONLY",
"name": "envoy-gateway-system/eg/http",
"perConnectionBufferLimitBytes": 32768
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: envoy-gateway-system/eg/http
perConnectionBufferLimitBytes: 32768
- '@type': type.googleapis.com/envoy.admin.v3.RoutesConfigDump
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,5 +75,6 @@ xds:
serverHeaderTransformation: PASS_THROUGH
statPrefix: http
useRemoteAddress: true
drainType: MODIFY_ONLY
name: envoy-gateway-system/eg/http
perConnectionBufferLimitBytes: 32768
71 changes: 71 additions & 0 deletions internal/cmd/envoy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright Envoy Gateway Authors
// SPDX-License-Identifier: Apache-2.0
// The full text of the Apache license is available in the LICENSE file at
// the root of the repo.

package cmd

import (
"time"

"github.com/spf13/cobra"

"github.com/envoyproxy/gateway/internal/cmd/envoy"
)

// getEnvoyCommand builds the parent "envoy" cobra command and registers the
// shutdown and shutdown-manager subcommands beneath it.
func getEnvoyCommand() *cobra.Command {
	envoyCmd := new(cobra.Command)
	envoyCmd.Use = "envoy"
	envoyCmd.Short = "Envoy proxy management"

	// Register both subcommands in one call, keeping the original order.
	envoyCmd.AddCommand(
		getShutdownCommand(),
		getShutdownManagerCommand(),
	)

	return envoyCmd
}

// getShutdownCommand builds the "shutdown" cobra command. Its flags are bound
// to local option values which are handed to envoy.Shutdown when the command
// runs.
func getShutdownCommand() *cobra.Command {
	// opts holds the flag-backed settings captured by the RunE closure.
	var opts struct {
		drainTimeout      time.Duration
		minDrainDuration  time.Duration
		exitAtConnections int
	}

	shutdownCmd := &cobra.Command{
		Use:   "shutdown",
		Short: "Gracefully drain open connections prior to pod shutdown.",
		RunE: func(_ *cobra.Command, _ []string) error {
			return envoy.Shutdown(opts.drainTimeout, opts.minDrainDuration, opts.exitAtConnections)
		},
	}

	flags := shutdownCmd.PersistentFlags()
	flags.DurationVar(&opts.drainTimeout, "drain-timeout", 600*time.Second,
		"Graceful shutdown timeout. This should be less than the pod's terminationGracePeriodSeconds.")
	flags.DurationVar(&opts.minDrainDuration, "min-drain-duration", 5*time.Second,
		"Minimum drain duration allowing time for endpoint deprogramming to complete.")
	flags.IntVar(&opts.exitAtConnections, "exit-at-connections", 0,
		"Number of connections to wait for when monitoring Envoy listener drain process.")

	return shutdownCmd
}

// getShutdownManagerCommand builds the "shutdown-manager" cobra command. The
// --ready-timeout flag value is passed to envoy.ShutdownManager when the
// command runs.
func getShutdownManagerCommand() *cobra.Command {
	// timeout is filled in by flag parsing before RunE is invoked.
	var timeout time.Duration

	managerCmd := new(cobra.Command)
	managerCmd.Use = "shutdown-manager"
	managerCmd.Short = "Provides HTTP endpoint used in preStop hook to block until ready for pod shutdown."
	managerCmd.RunE = func(_ *cobra.Command, _ []string) error {
		return envoy.ShutdownManager(timeout)
	}

	flags := managerCmd.PersistentFlags()
	flags.DurationVar(&timeout, "ready-timeout", 610*time.Second,
		"Shutdown ready timeout. This should be greater than shutdown's drain-timeout and less than the pod's terminationGracePeriodSeconds.")

	return managerCmd
}
Loading

0 comments on commit 329aafc

Please sign in to comment.