Skip to content

Commit

Permalink
MTSRE-1521: Collect metrics for reconcile errors (#471)
Browse files Browse the repository at this point in the history
* MTSRE-1521: Collect metrics for reconcile errors

* updated with review requests
  • Loading branch information
pepedocs authored Nov 2, 2023
1 parent 61fd0e6 commit 9cefc13
Show file tree
Hide file tree
Showing 18 changed files with 409 additions and 43 deletions.
1 change: 1 addition & 0 deletions cmd/addon-operator-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ func initReconcilers(mgr ctrl.Manager,
aictrl.WithLog{Log: addonInstancePhaseLog.WithName("checkHeartbeat")},
),
},
aictrl.WithRecorder{Recorder: recorder},
)

if err := addonInstanceCtrl.SetupWithManager(mgr); err != nil {
Expand Down
7 changes: 7 additions & 0 deletions internal/controllers/addon/addon_deletion_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
ctrl "sigs.k8s.io/controller-runtime"

addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
"github.com/openshift/addon-operator/internal/controllers"
"github.com/openshift/addon-operator/internal/metrics"
)

const (
Expand All @@ -34,6 +36,7 @@ func (c defaultClock) Now() time.Time {
type addonDeletionReconciler struct {
clock clock
handlers []addonDeletionHandler
recorder *metrics.Recorder
}

func (r *addonDeletionReconciler) Reconcile(ctx context.Context, addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
Expand All @@ -52,13 +55,17 @@ func (r *addonDeletionReconciler) Reconcile(ctx context.Context, addon *addonsv1
// We set ReadyToBeDeleted=false status condition in response to the delete signal received from OCM.
reportAddonReadyToBeDeletedStatus(addon, metav1.ConditionFalse)

reconErr := metrics.NewReconcileError("addon", r.recorder, true)

for _, handler := range r.handlers {
if err := handler.NotifyAddon(ctx, addon); err != nil {
err = reconErr.Join(err, controllers.ErrNotifyAddon)
return ctrl.Result{}, err
}
// If ack is received from the underlying addon, we report ReadyToBeDeleted = true.
ackReceived, err := handler.AckReceivedFromAddon(ctx, addon)
if err != nil {
err = reconErr.Join(err, controllers.ErrAckReceivedFromAddon)
return ctrl.Result{}, err
}
if ackReceived {
Expand Down
14 changes: 9 additions & 5 deletions internal/controllers/addon/addon_instance_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"fmt"

"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
ctrl "sigs.k8s.io/controller-runtime"
Expand All @@ -15,20 +15,24 @@ import (

addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
"github.com/openshift/addon-operator/internal/controllers"
"github.com/openshift/addon-operator/internal/metrics"
)

const ADDON_INSTANCE_RECONCILER_NAME = "addonInstanceReconciler"

type addonInstanceReconciler struct {
client client.Client
scheme *runtime.Scheme
client client.Client
scheme *runtime.Scheme
recorder *metrics.Recorder
}

func (r *addonInstanceReconciler) Reconcile(ctx context.Context,
addon *addonsv1alpha1.Addon) (reconcile.Result, error) {
reconErr := metrics.NewReconcileError("addon", r.recorder, true)
// Ensure the creation of the corresponding AddonInstance in .spec.install.olmOwnNamespace/.spec.install.olmAllNamespaces namespace
if err := r.ensureAddonInstance(ctx, addon); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to ensure the creation of addoninstance: %w", err)
err = reconErr.Join(err, controllers.ErrEnsureCreateAddonInstance)
return ctrl.Result{}, err
}
return reconcile.Result{}, nil
}
Expand Down Expand Up @@ -73,7 +77,7 @@ func (r *addonInstanceReconciler) reconcileAddonInstance(
ctx context.Context, desiredAddonInstance *addonsv1alpha1.AddonInstance) error {
currentAddonInstance := &addonsv1alpha1.AddonInstance{}
err := r.client.Get(ctx, client.ObjectKeyFromObject(desiredAddonInstance), currentAddonInstance)
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
return r.client.Create(ctx, desiredAddonInstance)
}
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions internal/controllers/addon/addon_reconciler_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ func (w WithPackageOperatorReconciler) ApplyToAddonReconciler(config *AddonRecon
Scheme: w.Scheme,
ClusterID: config.ClusterExternalID,
OcmClusterInfo: config.GetOCMClusterInfo,
recorder: config.Recorder,
}
config.subReconcilers = append(config.subReconcilers, poReconciler)
}
Expand Down
28 changes: 22 additions & 6 deletions internal/controllers/addon/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,35 +97,41 @@ func NewAddonReconciler(
&legacyDeletionHandler{client: client, uncachedClient: uncachedClient},
&addonInstanceDeletionHandler{client: client},
},
recorder: recorder,
},
// Step 2: Reconcile Namespace
&namespaceReconciler{
client: client,
scheme: scheme,
client: client,
scheme: scheme,
recorder: recorder,
},
// Step 3: Reconcile Addon pull secrets
&addonSecretPropagationReconciler{
cachedClient: client,
uncachedClient: uncachedClient,
scheme: scheme,
addonOperatorNamespace: addonOperatorNamespace,
recorder: recorder,
},
// Step 4: Reconcile AddonInstance object
&addonInstanceReconciler{
client: client,
scheme: scheme,
client: client,
scheme: scheme,
recorder: recorder,
},
// Step 5: Reconcile OLM objects
&olmReconciler{
client: client,
uncachedClient: uncachedClient,
scheme: scheme,
operatorResourceHandler: operatorResourceHandler,
recorder: recorder,
},
// Step 6: Reconcile Monitoring Federation
&monitoringFederationReconciler{
client: client,
scheme: scheme,
client: client,
scheme: scheme,
recorder: recorder,
},
},
}
Expand Down Expand Up @@ -273,9 +279,11 @@ func (r *AddonReconciler) Reconcile(
) (ctrl.Result, error) {
logger := r.Log.WithValues("addon", req.NamespacedName.String())
ctx = controllers.ContextWithLogger(ctx, logger)
reconErr := metrics.NewReconcileError("addon", r.Recorder, false)

addon := &addonsv1alpha1.Addon{}
if err := r.Get(ctx, req.NamespacedName, addon); err != nil {
reconErr.Report(controllers.ErrGetAddon, addon.Name)
return ctrl.Result{}, client.IgnoreNotFound(err)
}

Expand All @@ -287,6 +295,10 @@ func (r *AddonReconciler) Reconcile(
}
errors := r.syncWithExternalAPIs(ctx, logger, addon)

if errors.ErrorOrNil() != nil {
reconErr.Report(controllers.ErrSyncWithExternalAPIs, addon.Name)
}

// append reconcilerErr
errors = multierror.Append(errors, reconcileErr)

Expand Down Expand Up @@ -322,6 +334,8 @@ func (r *AddonReconciler) reconcile(ctx context.Context, addon *addonsv1alpha1.A
log logr.Logger,
) (ctrl.Result, error) {
ctx = controllers.ContextWithLogger(ctx, log)
reconErr := metrics.NewReconcileError("addon", r.Recorder, false)
subReconErr := metrics.NewReconcileError("addon", r.Recorder, true)
// Handle addon deletion before checking for pause condition.
// This allows even paused addons to be deleted.
if !addon.DeletionTimestamp.IsZero() {
Expand Down Expand Up @@ -362,13 +376,15 @@ func (r *AddonReconciler) reconcile(ctx context.Context, addon *addonsv1alpha1.A
if !controllerutil.ContainsFinalizer(addon, cacheFinalizer) {
controllerutil.AddFinalizer(addon, cacheFinalizer)
if err := r.Update(ctx, addon); err != nil {
reconErr.Report(controllers.ErrUpdateAddon, addon.Name)
return ctrl.Result{}, fmt.Errorf("failed to add finalizer: %w", err)
}
}

// Run each sub reconciler serially
for _, reconciler := range r.subReconcilers {
if result, err := reconciler.Reconcile(ctx, addon); err != nil {
subReconErr.Report(err, addon.Name)
return ctrl.Result{}, fmt.Errorf("%s : failed to reconcile : %w", reconciler.Name(), err)
} else if !result.IsZero() {
return result, nil
Expand Down
43 changes: 40 additions & 3 deletions internal/controllers/addon/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

promTestUtil "github.com/prometheus/client_golang/prometheus/testutil"

addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
"github.com/openshift/addon-operator/internal/controllers"
"github.com/openshift/addon-operator/internal/metrics"
"github.com/openshift/addon-operator/internal/ocm"
"github.com/openshift/addon-operator/internal/ocm/ocmtest"
"github.com/openshift/addon-operator/internal/testutil"
Expand All @@ -26,7 +30,10 @@ type reconcileErrorTestCase struct {
statusUpdateErrPresent bool
}

var _ addonReconciler = (*mockSubReconciler)(nil)
var (
_ addonReconciler = (*mockSubReconciler)(nil)
errMockSubReconcile = errors.New("failed to reconcile")
)

type mockSubReconciler struct {
returnErr bool
Expand All @@ -38,7 +45,7 @@ func (m *mockSubReconciler) Name() string {

func (m *mockSubReconciler) Reconcile(ctx context.Context, addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
if m.returnErr {
return ctrl.Result{}, errors.New("failed to reconcile")
return ctrl.Result{}, errMockSubReconcile
}
return ctrl.Result{}, nil
}
Expand Down Expand Up @@ -86,13 +93,17 @@ func TestReconcileErrorHandling(t *testing.T) {
statusUpdateErrPresent: true,
},
}
for _, testCase := range testCases {

for idx, testCase := range testCases {
client := testutil.NewClient()
ocmClient := ocmtest.NewClient()
recorder := metrics.NewRecorder(true, fmt.Sprintf("clusterID-%v", idx))

r := AddonReconciler{
Client: client,
ocmClient: ocmClient,
Log: logr.Discard(),
Recorder: recorder,
subReconcilers: []addonReconciler{},
}

Expand Down Expand Up @@ -136,6 +147,32 @@ func TestReconcileErrorHandling(t *testing.T) {
assert.True(t, ok, "expected multi error")
assert.Equal(t, expectedNumErrors(testCase), multiErr.Len())
}

metric := recorder.GetReconcileErrorMetric()
assert.NotNil(t, metric)

if testCase.externalAPISyncErrPresent {
// Ensure sync error during reconcile was collected as a metric
controllerMetricVal := promTestUtil.ToFloat64(
metric.WithLabelValues(
"addon",
controllers.ErrSyncWithExternalAPIs.Error(),
addon.Name,
),
)
assert.True(t, controllerMetricVal > 0)
}
if testCase.reconcilerErrPresent {
// Ensure reconcile error was collected as a metric
controllerMetricVal := promTestUtil.ToFloat64(
metric.WithLabelValues(
"addon",
errMockSubReconcile.Error(),
addon.Name,
),
)
assert.True(t, controllerMetricVal > 0)
}
}
}

Expand Down
16 changes: 12 additions & 4 deletions internal/controllers/addon/monitoring_federation_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,21 @@ import (

addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
"github.com/openshift/addon-operator/internal/controllers"
"github.com/openshift/addon-operator/internal/metrics"
)

const MONITORING_FEDERATION_RECONCILER_NAME = "monitoringFederationReconciler"

type monitoringFederationReconciler struct {
client client.Client
scheme *runtime.Scheme
client client.Client
scheme *runtime.Scheme
recorder *metrics.Recorder
}

func (r *monitoringFederationReconciler) Reconcile(ctx context.Context,
addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
log := controllers.LoggerFromContext(ctx)
reconErr := metrics.NewReconcileError("addon", r.recorder, true)

// Possibly ensure monitoring federation
// Normally this would be configured before the addon workload is installed
Expand All @@ -44,14 +47,19 @@ func (r *monitoringFederationReconciler) Reconcile(ctx context.Context,

return ctrl.Result{}, nil
} else if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to ensure ServiceMonitor: %w", err)
err = reconErr.Join(err, controllers.ErrEnsureCreateServiceMonitor)
return ctrl.Result{}, err
} else if !result.IsZero() {
return result, nil
}

// Remove possibly unwanted monitoring federation
if err := r.ensureDeletionOfUnwantedMonitoringFederation(ctx, addon); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to ensure deletion of unwanted ServiceMonitors: %w", err)
err = reconErr.Join(
err,
controllers.ErrEnsureDeleteServiceMonitor,
)
return ctrl.Result{}, err
}
return reconcile.Result{}, nil
}
Expand Down
8 changes: 6 additions & 2 deletions internal/controllers/addon/monitoring_stack_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,17 @@ import (

addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
"github.com/openshift/addon-operator/internal/controllers"
"github.com/openshift/addon-operator/internal/metrics"
)

const MONITORING_STACK_RECONCILER_NAME = "monitoringStackReconciler"

var errMonitoringStackSpecNotFound = fmt.Errorf("monitoring stack spec not found")

type monitoringStackReconciler struct {
client client.Client
scheme *runtime.Scheme
client client.Client
scheme *runtime.Scheme
recorder *metrics.Recorder
}

func (r *monitoringStackReconciler) Name() string {
Expand All @@ -37,13 +39,15 @@ func (r *monitoringStackReconciler) Name() string {

func (r *monitoringStackReconciler) Reconcile(ctx context.Context,
addon *addonsv1alpha1.Addon) (ctrl.Result, error) {
reconErr := metrics.NewReconcileError("addon", r.recorder, true)

// ensure creation of MonitoringStack object
latestMonitoringStack, err := r.ensureMonitoringStack(ctx, addon)
if err != nil {
if errors.Is(err, errMonitoringStackSpecNotFound) {
return reconcile.Result{}, nil
}
err = reconErr.Join(err, controllers.ErrEnsureCreateMonitoringStack)
return reconcile.Result{}, err
}

Expand Down
6 changes: 4 additions & 2 deletions internal/controllers/addon/namespace_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ import (

addonsv1alpha1 "github.com/openshift/addon-operator/apis/addons/v1alpha1"
"github.com/openshift/addon-operator/internal/controllers"
"github.com/openshift/addon-operator/internal/metrics"
)

const NAMESPACE_RECONCILER_NAME = "namespaceReconciler"

type namespaceReconciler struct {
client client.Client
scheme *runtime.Scheme
client client.Client
scheme *runtime.Scheme
recorder *metrics.Recorder
}

func (r *namespaceReconciler) Reconcile(ctx context.Context,
Expand Down
Loading

0 comments on commit 9cefc13

Please sign in to comment.