Skip to content

Commit

Permalink
feat: add pull mount histogram metrics (#79)
Browse files Browse the repository at this point in the history
* feat: wip add histogram and counter metrics for pull and mount operations

* refactor: move registering metrics and starting metrics server to a separate fn
- so that it is easier to use in tests
- fix bug where `err` is used instead of `e`
- remove metric for mount and pull wait timeout (never really recorded because kubelet timeout exhausts before)

* test: wip test for metrics

* test: add e2e test
- hook it into cicd
- finish up unit tests
- fix nil ptr error for async errors
-

* refactor: remove debug statement
- not needed anymore

* fix: metrics test gha step name for

* chore: rename metrics test job `error-compatible-ephemeral-volume` -> `error-ephemeral-volume`

* debug: add ssh action

* fix: ci/cd test failing
- because the script assumes warm metal driver is deployed in `default` namespace (it is deployed in `kube-system` namespace)

* docs: fix typo in code comments

* chore: bump release version to `v0.9.0`
  • Loading branch information
vadasambar authored Dec 20, 2023
1 parent 6061d3a commit 8c0cf34
Show file tree
Hide file tree
Showing 16 changed files with 429 additions and 5 deletions.
37 changes: 37 additions & 0 deletions .github/workflows/metrics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: test-metrics-5m
on:
push:
branches: [master]
pull_request:
branches: [master]
workflow_dispatch:
jobs:
integration:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Start a kind cluster with containerd
uses: helm/kind-action@v1.4.0
with:
cluster_name: kind-${{ github.run_id }}
kubectl_version: "v1.25.2"
config: ./hack/ci/containerd-cluster-conf.yaml
- name: Build image
run: ./hack/ci/build.sh
- name: Set image version
run: |
echo "VALUE_FILE=charts/warm-metal-csi-driver/values.yaml" >> "$GITHUB_ENV"
echo "IMAGE_TAG=$(git rev-parse --short HEAD)" >> "$GITHUB_ENV"
echo "HELM_NAME=wm-csi-integration-tests" >> "$GITHUB_ENV"
- name: Install the CSI Driver
run: |
trap "kubectl -n kube-system describe po" ERR
helm install ${HELM_NAME} charts/warm-metal-csi-driver -n kube-system \
-f ${VALUE_FILE} \
--set csiPlugin.image.tag=${IMAGE_TAG} \
--wait \
--debug
- name: Test metrics
run: ./test/integration/test-metrics.sh
- name: Uninstall the CSI Driver
run: helm uninstall -n kube-system ${HELM_NAME} --wait
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION ?= v0.8.2
VERSION ?= v0.9.0

IMAGE_BUILDER ?= docker
IMAGE_BUILD_CMD ?= buildx
Expand Down
4 changes: 2 additions & 2 deletions charts/warm-metal-csi-driver/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.8.2
version: 0.9.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
appVersion: v0.8.2
appVersion: v0.9.0
3 changes: 3 additions & 0 deletions charts/warm-metal-csi-driver/templates/nodeplugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ spec:
- containerPort: 9809
name: metrics
protocol: TCP
- containerPort: 8080
name: metrics2
protocol: TCP
livenessProbe:
{{- toYaml .Values.csiPlugin.livenessProbe | nindent 12}}
securityContext:
Expand Down
9 changes: 9 additions & 0 deletions charts/warm-metal-csi-driver/templates/podmonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@ spec:
{{- if .Values.podMonitor.timeout }}
scrapeTimeout: {{ .Values.podMonitor.timeout }}
{{- end }}
- path: /metrics
port: metrics2
scheme: http
{{- if .Values.podMonitor.interval }}
interval: {{ .Values.podMonitor.interval }}
{{- end }}
{{- if .Values.podMonitor.timeout }}
scrapeTimeout: {{ .Values.podMonitor.timeout }}
{{- end }}
jobLabel: {{ include "warm-metal-csi-driver.fullname" . }}
namespaceSelector:
matchNames:
Expand Down
2 changes: 2 additions & 0 deletions cmd/plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/warm-metal/csi-driver-image/pkg/backend/containerd"
"github.com/warm-metal/csi-driver-image/pkg/backend/crio"
"github.com/warm-metal/csi-driver-image/pkg/cri"
"github.com/warm-metal/csi-driver-image/pkg/metrics"
"github.com/warm-metal/csi-driver-image/pkg/secret"
"github.com/warm-metal/csi-driver-image/pkg/watcher"
csicommon "github.com/warm-metal/csi-drivers/pkg/csi-common"
Expand Down Expand Up @@ -145,5 +146,6 @@ func main() {
)
}

metrics.StartMetricsServer(metrics.RegisterMetrics())
server.Wait()
}
3 changes: 3 additions & 0 deletions cmd/plugin/node_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/container-storage-interface/spec/lib/go/csi"
"github.com/containerd/containerd/reference/docker"
"github.com/warm-metal/csi-driver-image/pkg/backend"
"github.com/warm-metal/csi-driver-image/pkg/metrics"
"github.com/warm-metal/csi-driver-image/pkg/mountexecutor"
"github.com/warm-metal/csi-driver-image/pkg/mountstatus"
"github.com/warm-metal/csi-driver-image/pkg/pullexecutor"
Expand Down Expand Up @@ -189,6 +190,8 @@ func (n NodeServer) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpubl
}

if err = n.mounter.Unmount(ctx, req.VolumeId, backend.MountTarget(req.TargetPath)); err != nil {
// TODO(vadasambar): move this to mountexecutor once mountexecutor has `StartUnmounting` function
metrics.OperationErrorsCount.WithLabelValues("StartUnmounting").Inc()
err = status.Error(codes.Internal, err.Error())
return
}
Expand Down
166 changes: 164 additions & 2 deletions cmd/plugin/node_server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@ package main
import (
"context"
"fmt"
"io"
"net"
"net/http"
"net/url"
"os"
"strings"
Expand All @@ -17,6 +19,7 @@ import (
"github.com/warm-metal/csi-driver-image/pkg/backend"
"github.com/warm-metal/csi-driver-image/pkg/backend/containerd"
"github.com/warm-metal/csi-driver-image/pkg/cri"
"github.com/warm-metal/csi-driver-image/pkg/metrics"
csicommon "github.com/warm-metal/csi-drivers/pkg/csi-common"
"google.golang.org/grpc"
"k8s.io/apimachinery/pkg/util/wait"
Expand All @@ -43,7 +46,7 @@ func TestNodePublishVolumeAsync(t *testing.T) {
asyncImagePulls := true
ns := NewNodeServer(driver, mounter, criClient, &testSecretStore{}, asyncImagePulls)

// based on kubelet's csi mounter pluginc ode
// based on kubelet's csi mounter plugin code
// check https://github.com/kubernetes/kubernetes/blob/b06a31b87235784bad2858be62115049b6eb6bcd/pkg/volume/csi/csi_mounter.go#L111-L112
timeout := 100 * time.Millisecond

Expand Down Expand Up @@ -166,7 +169,7 @@ func TestNodePublishVolumeSync(t *testing.T) {
asyncImagePulls := false
ns := NewNodeServer(driver, mounter, criClient, &testSecretStore{}, asyncImagePulls)

// based on kubelet's csi mounter pluginc ode
// based on kubelet's csi mounter plugin code
// check https://github.com/kubernetes/kubernetes/blob/b06a31b87235784bad2858be62115049b6eb6bcd/pkg/volume/csi/csi_mounter.go#L111-L112
timeout := 100 * time.Millisecond

Expand Down Expand Up @@ -273,6 +276,165 @@ func TestNodePublishVolumeSync(t *testing.T) {
assert.ErrorContains(t, err, "not found")
}

// Check test/integration/node-server/README.md for how to run this test correctly
func TestMetrics(t *testing.T) {
socketAddr := "unix:///run/containerd/containerd.sock"
addr, err := url.Parse(socketAddr)
assert.NoError(t, err)

criClient, err := cri.NewRemoteImageService(socketAddr, time.Minute)
assert.NoError(t, err)
assert.NotNil(t, criClient)

mounter := containerd.NewMounter(addr.Path)
assert.NotNil(t, mounter)

driver := csicommon.NewCSIDriver(driverName, driverVersion, "fake-node")
assert.NotNil(t, driver)

asyncImagePulls := true
ns := NewNodeServer(driver, mounter, criClient, &testSecretStore{}, asyncImagePulls)

// based on kubelet's csi mounter plugin code
// check https://github.com/kubernetes/kubernetes/blob/b06a31b87235784bad2858be62115049b6eb6bcd/pkg/volume/csi/csi_mounter.go#L111-L112
timeout := 10 * time.Second

server := csicommon.NewNonBlockingGRPCServer()

addr, err = url.Parse(*endpoint)
assert.NoError(t, err)

os.Remove("/csi/csi.sock")

// automatically deleted when the server is stopped
f, err := os.Create("/csi/csi.sock")
assert.NoError(t, err)
assert.NotNil(t, f)

var wg sync.WaitGroup
wg.Add(1)
go func() {
metrics.StartMetricsServer(metrics.RegisterMetrics())

server.Start(*endpoint,
nil,
nil,
ns)
// wait for the GRPC server to start
wg.Done()
server.Wait()
}()

// give some time for server to start
time.Sleep(2 * time.Second)
defer func() {
klog.Info("server was stopped")
server.Stop()
}()

wg.Wait()
var conn *grpc.ClientConn

conn, err = grpc.Dial(
addr.Path,
grpc.WithInsecure(),
grpc.WithContextDialer(func(ctx context.Context, targetPath string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, "unix", targetPath)
}),
)

if err != nil {
panic(err)
}

assert.NoError(t, err)
assert.NotNil(t, conn)

nodeClient := csipbv1.NewNodeClient(conn)
assert.NotNil(t, nodeClient)

ctx, cancel := context.WithTimeout(context.Background(), 3*timeout)
defer cancel()
// wrong image id
wrongVolId := "docker.io-doesnt-exist/library/redis-doesnt-exist:latest"
wrongTargetPath := "wrong-test-path"
wrongReq := &csi.NodePublishVolumeRequest{
VolumeId: wrongVolId,
TargetPath: wrongTargetPath,
VolumeContext: map[string]string{
// so that the test would always attempt to pull an image
ctxKeyPullAlways: "true",
},
VolumeCapability: &csi.VolumeCapability{
AccessMode: &csi.VolumeCapability_AccessMode{
Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY,
},
},
}

r, err := nodeClient.NodePublishVolume(ctx, wrongReq)
assert.Error(t, err)
assert.Nil(t, r)

volId := "docker.io/library/redis:latest"
targetPath := "test-path"
req := &csi.NodePublishVolumeRequest{
VolumeId: volId,
TargetPath: targetPath,
VolumeContext: map[string]string{
// so that the test would always attempt to pull an image
ctxKeyPullAlways: "true",
},
VolumeCapability: &csi.VolumeCapability{
AccessMode: &csi.VolumeCapability_AccessMode{
Mode: csi.VolumeCapability_AccessMode_SINGLE_NODE_READER_ONLY,
},
},
}

condFn := func() (done bool, err error) {
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
resp, err := nodeClient.NodePublishVolume(ctx, req)
if err != nil && strings.Contains(err.Error(), context.DeadlineExceeded.Error()) {
klog.Errorf("context deadline exceeded; retrying: %v", err)
return false, nil
}
if resp != nil {
return true, nil
}
return false, fmt.Errorf("response from `NodePublishVolume` is nil")
}

err = wait.PollImmediate(
timeout,
30*time.Second,
condFn)
assert.NoError(t, err)

resp, err := http.Get("http://:8080/metrics")
assert.NoError(t, err)
assert.NotNil(t, resp)
assert.Equal(t, http.StatusOK, resp.StatusCode)

b1, err := io.ReadAll(resp.Body)
assert.NoError(t, err)
respBody := string(b1)
assert.Contains(t, respBody, metrics.ImagePullTimeKey)
assert.Contains(t, respBody, metrics.ImageMountTimeKey)
assert.Contains(t, respBody, metrics.OperationErrorsCountKey)

// give some time before stopping the server
time.Sleep(5 * time.Second)

// unmount if the volume is already mounted
c, ca := context.WithTimeout(context.Background(), time.Second*10)
defer ca()

err = mounter.Unmount(c, volId, backend.MountTarget(targetPath))
assert.NoError(t, err)
}

type testSecretStore struct {
}

Expand Down
61 changes: 61 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package metrics

import (
"net/http"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
)

const Async = "async"
const Sync = "sync"
const ImagePullTimeKey = "pull_duration_seconds"
const ImageMountTimeKey = "mount_duration_seconds"
const OperationErrorsCountKey = "operation_errors_total"

var ImagePullTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "warm_metal",
Name: ImagePullTimeKey,
Help: "The time it took to pull an image",
Buckets: []float64{0, 1, 5, 10, 15, 30, 60, 120, 180},
},
[]string{"operation_type"},
)

var ImageMountTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: "warm_metal",
Name: ImageMountTimeKey,
Help: "The time it took to mount an image",
Buckets: []float64{0, 1, 5, 10, 15, 30, 60, 120, 180},
},
[]string{"operation_type"},
)

var OperationErrorsCount = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: "warm_metal",
Name: OperationErrorsCountKey,
Help: "Cumulative number of operation (pull,mount,unmount) errors in the driver",
},
[]string{"operation_type"},
)

func RegisterMetrics() *prometheus.Registry {
reg := prometheus.NewRegistry()
reg.MustRegister(ImagePullTime)
reg.MustRegister(ImageMountTime)
reg.MustRegister(OperationErrorsCount)

return reg
}

func StartMetricsServer(reg *prometheus.Registry) {
go func() {
http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg}))
klog.Info("serving internal metrics at port 8080")
klog.Fatal(http.ListenAndServe(":8080", nil))
}()
}
Loading

0 comments on commit 8c0cf34

Please sign in to comment.