Skip to content

Commit

Permalink
stdserver: Improve health check ergonomics
Browse files Browse the repository at this point in the history
This change replaces the health check's context with a configurable timeout,
instead of using the request context. Error messages around context timeouts
are provided to improve operator ergonomics.

(cherry picked from commit 5a4ca94)
  • Loading branch information
bobvawter committed Oct 28, 2024
1 parent 88540c8 commit 5c71db3
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 12 deletions.
7 changes: 5 additions & 2 deletions internal/source/cdc/server/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,12 @@ func ProvideListener(
// ProvideMux is called by Wire to construct the http.ServeMux that
// routes requests.
func ProvideMux(
handler *cdc.Handler, stagingPool *types.StagingPool, targetPool *types.TargetPool,
config *Config,
handler *cdc.Handler,
stagingPool *types.StagingPool,
targetPool *types.TargetPool,
) *http.ServeMux {
return stdserver.Mux(handler, stagingPool, targetPool)
return stdserver.Mux(&config.HTTP, handler, stagingPool, targetPool)
}

// ProvideServer is called by Wire to construct the top-level network
Expand Down
4 changes: 2 additions & 2 deletions internal/source/cdc/server/wire_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions internal/util/stdserver/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,20 @@
package stdserver

import (
"time"

"github.com/pkg/errors"
"github.com/spf13/pflag"
)

const defaultHealthCheckTimeout = 5 * time.Second

// Config contains the user-visible configuration for running an http server.
type Config struct {
BindAddr string
DisableAuth bool
GenerateSelfSigned bool
HealthCheckTimeout time.Duration
TLSCertFile string
TLSPrivateKey string
}
Expand All @@ -47,6 +52,11 @@ func (c *Config) Bind(flags *pflag.FlagSet) {
"tlsSelfSigned",
false,
"if true, generate a self-signed TLS certificate valid for 'localhost'")
flags.DurationVar(
&c.HealthCheckTimeout,
"healthCheckTimeout",
defaultHealthCheckTimeout,
"the timeout for the health check endpoint")
flags.StringVar(
&c.TLSCertFile,
"tlsCertificate",
Expand All @@ -70,5 +80,8 @@ func (c *Config) Preflight() error {
if c.GenerateSelfSigned && c.TLSCertFile != "" {
return errors.New("self-signed certificate requested, but also specified a TLS certificate")
}
if c.HealthCheckTimeout <= 0 {
c.HealthCheckTimeout = defaultHealthCheckTimeout
}
return nil
}
64 changes: 64 additions & 0 deletions internal/util/stdserver/healthz_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// Copyright 2024 The Cockroach Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// SPDX-License-Identifier: Apache-2.0

package stdserver

import (
"net/http"
"net/http/httptest"
"testing"

"github.com/cockroachdb/replicator/internal/sinktest/base"
"github.com/stretchr/testify/require"
)

func TestHealthzTimeout(t *testing.T) {
r := require.New(t)

fixture, err := base.NewFixture(t)
r.NoError(err)

cfg := &Config{
HealthCheckTimeout: -1, // Cancel the context immediately.
}

mux := Mux(
cfg,
http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {}),
fixture.StagingPool,
fixture.TargetPool,
)

req := httptest.NewRequest("GET", healthCheckPath, nil /* body */)

handler, _ := mux.Handler(req)
r.NotNil(handler)

w := httptest.NewRecorder()
handler.ServeHTTP(w, req)
r.Equal(http.StatusRequestTimeout, w.Code)
r.Contains(w.Body.String(), "timed out")

// Break the staging pool and expect a closed-pool error.
fixture.StagingPool.Close()
// Allow context to function.
cfg.HealthCheckTimeout = defaultHealthCheckTimeout

w = httptest.NewRecorder()
handler.ServeHTTP(w, req)
r.Equal(http.StatusInternalServerError, w.Code)
r.Contains(w.Body.String(), "closed pool")
}
37 changes: 29 additions & 8 deletions internal/util/stdserver/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
package stdserver

import (
"context"
"crypto/tls"
"fmt"
"net"
"net/http"

Expand All @@ -33,6 +35,11 @@ import (
"golang.org/x/net/http2/h2c"
)

const healthCheckPath = "/_/healthz"

// errHealthCheckTimeout is a causal error used by [Mux].
var errHealthCheckTimeout = errors.New("health check timed out")

// A Server receives incoming messages and
// applies them to a target cluster.
type Server struct {
Expand Down Expand Up @@ -112,18 +119,32 @@ func New(

// Mux constructs the http.ServeMux that routes requests.
func Mux(
handler http.Handler, stagingPool *types.StagingPool, targetPool *types.TargetPool,
cfg *Config, handler http.Handler, stagingPool *types.StagingPool, targetPool *types.TargetPool,
) *http.ServeMux {
mux := &http.ServeMux{}
mux.HandleFunc("/_/healthz", func(w http.ResponseWriter, r *http.Request) {
if err := stagingPool.Ping(r.Context()); err != nil {
log.WithError(err).Warn("health check failed for staging pool")
http.Error(w, "health check failed for staging", http.StatusInternalServerError)
mux.HandleFunc(healthCheckPath, func(w http.ResponseWriter, r *http.Request) {
ctx, cancel := context.WithTimeoutCause(context.Background(),
cfg.HealthCheckTimeout, errHealthCheckTimeout)
defer cancel()

setError := func(err error, name string) {
if errors.Is(context.Cause(ctx), errHealthCheckTimeout) {
msg := fmt.Sprintf("health check for %s pool timed out", name)
log.Warn(msg)
http.Error(w, msg, http.StatusRequestTimeout)
return
}
msg := fmt.Sprintf("health check fail for %s pool: %v", name, err)
log.WithError(err).Warn(msg)
http.Error(w, msg, http.StatusInternalServerError)
}

if err := stagingPool.Ping(ctx); err != nil {
setError(err, "staging")
return
}
if err := targetPool.PingContext(r.Context()); err != nil {
log.WithError(err).Warn("health check failed for target pool")
http.Error(w, "health check failed for target", http.StatusInternalServerError)
if err := targetPool.PingContext(ctx); err != nil {
setError(err, "target")
return
}
http.Error(w, "OK", http.StatusOK)
Expand Down

0 comments on commit 5c71db3

Please sign in to comment.