diff --git a/docs/latest/user/gateway-api-metrics.md b/docs/latest/user/gateway-api-metrics.md new file mode 100644 index 00000000000..e403c90404f --- /dev/null +++ b/docs/latest/user/gateway-api-metrics.md @@ -0,0 +1,52 @@ +# Gateway API Metrics + +Resource metrics for Gateway API objects are available using the [Gateway API State Metrics](https://github.com/Kuadrant/gateway-api-state-metrics) project. +The project also provides example dashboards for visualising the metrics using Grafana, and example alerts using Prometheus & Alertmanager. + +## Prerequisites + +Follow the steps from the [Quickstart Guide](quickstart.md) to install Envoy Gateway and the example manifest. +Before proceeding, you should be able to query the example backend using HTTP. + +Run the following commands to install the metrics stack, with the Gateway API State Metrics configuration, on your Kubernetes cluster: + +```shell +kubectl apply --server-side -f https://raw.githubusercontent.com/Kuadrant/gateway-api-state-metrics/main/config/examples/kube-prometheus/bundle_crd.yaml +kubectl apply -f https://raw.githubusercontent.com/Kuadrant/gateway-api-state-metrics/main/config/examples/kube-prometheus/bundle.yaml +``` + +## Metrics and Alerts + +To access the Prometheus UI, wait for the statefulset to be ready, then use the port-forward command: + +```shell +# This first command may fail if the statefulset has not been created yet. +# In that case, try again until you get a message like 'Waiting for 2 pods to be ready...' +# or 'statefulset rolling update complete 2 pods...' +kubectl -n monitoring rollout status --watch --timeout=5m statefulset/prometheus-k8s +kubectl -n monitoring port-forward service/prometheus-k8s 9090:9090 > /dev/null & +``` + +Navigate to [http://localhost:9090](http://localhost:9090). +Metrics can be queried from the 'Graph' tab e.g. 
`gatewayapi_gateway_created` +See the [Gateway API State Metrics README](https://github.com/Kuadrant/gateway-api-state-metrics/tree/main#metrics) for the full list of Gateway API metrics available. + +Alerts can be seen in the 'Alerts' tab. +Gateway API specific alerts will be grouped under the 'gateway-api.rules' heading. + +***Note:*** Alerts are defined in a PrometheusRule custom resource in the 'monitoring' namespace. You can modify the alert rules by updating this resource. + +## Dashboards + +To view the dashboards in Grafana, wait for the deployment to be ready, then use the port-forward command: + +```shell +kubectl -n monitoring wait --timeout=5m deployment/grafana --for=condition=Available +kubectl -n monitoring port-forward service/grafana 3000:3000 > /dev/null & +``` + +Navigate to [http://localhost:3000](http://localhost:3000) and sign in with admin/admin. +The Gateway API State dashboards will be available in the 'Default' folder and tagged with 'gateway-api'. +See the [Gateway API State Metrics README](https://github.com/Kuadrant/gateway-api-state-metrics/tree/main#dashboards) for further information on available dashboards. + +***Note:*** Dashboards are loaded from ConfigMaps. You can modify the dashboards in the Grafana UI; however, you will need to export them from the UI and update the JSON in the ConfigMaps to persist changes. diff --git a/docs/latest/user_docs.rst b/docs/latest/user_docs.rst index 21d8e9c0f6d..8bd0c4ccfa9 100644 --- a/docs/latest/user_docs.rst +++ b/docs/latest/user_docs.rst @@ -30,5 +30,6 @@ Learn how to deploy, use, and operate Envoy Gateway. 
user/deployment-mode user/gateway-address user/gatewayapi-support + user/gateway-api-metrics user/proxy-observability user/multicluster-service diff --git a/internal/globalratelimit/runner/runner.go b/internal/globalratelimit/runner/runner.go index b8615cbeb83..4d4c41b4feb 100644 --- a/internal/globalratelimit/runner/runner.go +++ b/internal/globalratelimit/runner/runner.go @@ -81,7 +81,7 @@ func (r *Runner) Start(ctx context.Context) error { discoveryv3.RegisterAggregatedDiscoveryServiceServer(r.grpc, serverv3.NewServer(ctx, r.cache, cb)) // Start and listen xDS gRPC config Server. - go r.serverXdsConfigServer(ctx) + go r.serveXdsConfigServer(ctx) // Start message Subscription. go r.subscribeAndTranslate(ctx) @@ -90,20 +90,23 @@ func (r *Runner) Start(ctx context.Context) error { return nil } -func (r *Runner) serverXdsConfigServer(ctx context.Context) { +func (r *Runner) serveXdsConfigServer(ctx context.Context) { addr := net.JoinHostPort(XdsGrpcSotwConfigServerAddress, strconv.Itoa(ratelimit.XdsGrpcSotwConfigServerPort)) l, err := net.Listen("tcp", addr) if err != nil { r.Logger.Error(err, "failed to listen on address", "address", addr) return } + + go func() { + <-ctx.Done() + r.Logger.Info("grpc server shutting down") + r.grpc.Stop() + }() + if err = r.grpc.Serve(l); err != nil { r.Logger.Error(err, "failed to start grpc based xds config server") } - - <-ctx.Done() - r.Logger.Info("grpc config server shutting down") - r.grpc.Stop() } func (r *Runner) subscribeAndTranslate(ctx context.Context) { diff --git a/internal/provider/kubernetes/routes.go b/internal/provider/kubernetes/routes.go index d3d9fbea6d4..cc33db4e4f2 100644 --- a/internal/provider/kubernetes/routes.go +++ b/internal/provider/kubernetes/routes.go @@ -182,6 +182,7 @@ func (r *gatewayAPIReconciler) processGRPCRoutes(ctx context.Context, gatewayNam authFilter, ok := resourceMap.authenFilters[key] if !ok { r.log.Error(err, "AuthenticationFilter not found; bypassing rule", "index", i) + continue } 
resourceTree.AuthenticationFilters = append(resourceTree.AuthenticationFilters, authFilter) @@ -193,6 +194,7 @@ func (r *gatewayAPIReconciler) processGRPCRoutes(ctx context.Context, gatewayNam rateLimitFilter, ok := resourceMap.rateLimitFilters[key] if !ok { r.log.Error(err, "RateLimitFilter not found; bypassing rule", "index", i) + continue } resourceTree.RateLimitFilters = append(resourceTree.RateLimitFilters, rateLimitFilter) diff --git a/internal/xds/server/runner/runner.go b/internal/xds/server/runner/runner.go index ee285c133f2..af9117c8841 100644 --- a/internal/xds/server/runner/runner.go +++ b/internal/xds/server/runner/runner.go @@ -102,18 +102,20 @@ func (r *Runner) serveXdsServer(ctx context.Context) { r.Logger.Error(err, "failed to listen on address", "address", addr) return } - err = r.grpc.Serve(l) - if err != nil { + + go func() { + <-ctx.Done() + r.Logger.Info("grpc server shutting down") + // We don't use GracefulStop here because envoy + // has long-lived hanging xDS requests. There's no + // mechanism to make those pending requests fail, + // so we forcibly terminate the TCP sessions. + r.grpc.Stop() + }() + + if err = r.grpc.Serve(l); err != nil { r.Logger.Error(err, "failed to start grpc based xds server") } - - <-ctx.Done() - r.Logger.Info("grpc server shutting down") - // We don't use GracefulStop here because envoy - // has long-lived hanging xDS requests. There's no - // mechanism to make those pending requests fail, - // so we forcibly terminate the TCP sessions. - r.grpc.Stop() } // registerServer registers the given xDS protocol Server with the gRPC