Skip to content

Commit

Permalink
fix(daemon): improve health state lock test, remove LockCount (#373)
Browse files Browse the repository at this point in the history
As Harry pointed out at
#369 (comment),
there's a much simpler way to test this without needing a new State
method like LockCount. Just acquire the state lock, then call the
endpoint. If it times out, we know it was trying to acquire the lock.

In addition, fix an issue where the health endpoint would still hold the
state lock if it returned an error. Fix those and add a test for that
too.
  • Loading branch information
benhoyt authored Mar 28, 2024
1 parent 0d00024 commit 345d13c
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 39 deletions.
13 changes: 11 additions & 2 deletions internals/daemon/api_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ func v1Health(c *Command, r *http.Request, _ *UserState) Response {
switch level {
case plan.UnsetLevel, plan.AliveLevel, plan.ReadyLevel:
default:
return BadRequest(`level must be "alive" or "ready"`)
return healthError(http.StatusBadRequest, `level must be "alive" or "ready"`)
}

names := strutil.MultiCommaSeparatedList(query["names"])

checks, err := getChecks(c.d.overlord)
if err != nil {
logger.Noticef("Cannot fetch checks: %v", err.Error())
return InternalError("internal server error")
return healthError(http.StatusInternalServerError, "internal server error")
}

healthy := true
Expand Down Expand Up @@ -89,3 +89,12 @@ func (r *healthResp) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(status)
w.Write(bs)
}

func healthError(status int, message string) *healthResp {
return &healthResp{
Type: ResponseTypeError,
Status: status,
StatusText: http.StatusText(status),
Result: &errorResult{Message: message},
}
}
73 changes: 46 additions & 27 deletions internals/daemon/api_health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package daemon
import (
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/http/httptest"
Expand Down Expand Up @@ -213,7 +214,15 @@ func (s *healthSuite) TestChecksError(c *C) {
//
// - https://github.com/canonical/pebble/issues/366
// - https://bugs.launchpad.net/juju/+bug/2052517
func (s *apiSuite) TestStateLockNotHeld(c *C) {
func (s *apiSuite) TestHealthStateLockNotHeldSuccess(c *C) {
s.testHealthStateLockNotHeld(c, "", false)
}

func (s *apiSuite) TestHealthStateLockNotHeldError(c *C) {
s.testHealthStateLockNotHeld(c, "badlevel", true)
}

func (s *apiSuite) testHealthStateLockNotHeld(c *C, level string, expectErr bool) {
daemonOpts := &Options{
Dir: s.pebbleDir,
SocketPath: s.pebbleDir + ".pebble.socket",
Expand All @@ -226,35 +235,45 @@ func (s *apiSuite) TestStateLockNotHeld(c *C) {
c.Assert(daemon.Stop(nil), IsNil)
}()

// Wait for lock count to stabilise (daemon starts goroutine(s) that use
// it on startup). Normally it will stabilise almost instantly.
prevCount := daemon.state.LockCount()
for i := 0; i < 50; i++ {
if i == 49 {
c.Fatal("timed out waiting for lock count to stabilise")
// Acquire state lock so that the health endpoint can't.
daemon.state.Lock()
defer daemon.state.Unlock()

// Call health check endpoint in a goroutine so we can have a timeout.
errCh := make(chan error)
go func() (err error) {
defer func() {
errCh <- err
}()

// Use real HTTP client so we're exercising the full ServeHTTP flow.
// Could use daemon.serve.Handler directly, but this seems even better.
pebble, err := client.New(&client.Config{Socket: daemonOpts.SocketPath})
if err != nil {
return err
}
time.Sleep(100 * time.Millisecond)
newCount := daemon.state.LockCount()
if newCount == prevCount {
break
healthy, err := pebble.Health(&client.HealthOptions{
Level: client.CheckLevel(level),
})
if err != nil {
return err
}
prevCount = newCount
}

// Use real HTTP client so we're exercising the full ServeHTTP flow.
// Could use daemon.serve.Handler directly, but this seems even better.
pebble, err := client.New(&client.Config{
Socket: daemonOpts.SocketPath,
})
c.Assert(err, IsNil)
if !healthy {
return fmt.Errorf("/v1/health returned false")
}
return nil
}()

// Ensure lock count doesn't change during GET /v1/health request.
before := daemon.state.LockCount()
healthy, err := pebble.Health(&client.HealthOptions{})
c.Assert(err, IsNil)
c.Assert(healthy, Equals, true)
after := daemon.state.LockCount()
c.Assert(after, Equals, before)
select {
case healthErr := <-errCh:
if expectErr {
c.Assert(healthErr, NotNil)
} else {
c.Assert(healthErr, IsNil)
}
case <-time.After(5 * time.Second):
c.Fatalf("timed out waiting for /v1/health - it must be trying to acquire the state lock")
}
}

func serveHealth(c *C, method, url string, body io.Reader) (int, map[string]interface{}) {
Expand Down
10 changes: 0 additions & 10 deletions internals/overlord/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,6 @@ type State struct {
mu sync.Mutex
muC int32

lockCount int32 // used only for testing

lastTaskId int
lastChangeId int
lastLaneId int
Expand Down Expand Up @@ -137,14 +135,6 @@ func (s *State) Modified() bool {
func (s *State) Lock() {
s.mu.Lock()
atomic.AddInt32(&s.muC, 1)
atomic.AddInt32(&s.lockCount, 1)
}

// LockCount returns the number of times the state lock was held.
//
// NOTE: This needs to be exported, but should only be used in testing.
func (s *State) LockCount() int {
return int(atomic.LoadInt32(&s.lockCount))
}

func (s *State) reading() {
Expand Down

0 comments on commit 345d13c

Please sign in to comment.