From 6c27772dd6e19248c3cfcd242901e40b516eabb4 Mon Sep 17 00:00:00 2001 From: Jonathan West Date: Mon, 15 Apr 2024 13:22:25 -0400 Subject: [PATCH 1/5] Add sanity test of reconcile metrics after e2e tests Signed-off-by: Jonathan West --- hack/run-rollouts-manager-e2e-tests.sh | 130 ++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 3 deletions(-) diff --git a/hack/run-rollouts-manager-e2e-tests.sh b/hack/run-rollouts-manager-e2e-tests.sh index 03d9304..09e842d 100755 --- a/hack/run-rollouts-manager-e2e-tests.sh +++ b/hack/run-rollouts-manager-e2e-tests.sh @@ -5,6 +5,62 @@ SCRIPTPATH="$( pwd -P )" +# Treat undefined variables as errors +set -u + +TMP_DIR=$(mktemp -d 2>/dev/null || mktemp -d -t 'mytmpdir') + + +extract_metrics_data() { + + # 1) Extract REST client get/put/post metrics + + # Example: the metrics from /metric endpoint look like this: + # rest_client_requests_total{code="200",host="api.pgqqd-novoo-oqu.pa43.p3.openshiftapps.com:443",method="GET"} 42 + # rest_client_requests_total{code="200",host="api.pgqqd-novoo-oqu.pa43.p3.openshiftapps.com:443",method="PUT"} 88 + # rest_client_requests_total{code="201",host="api.pgqqd-novoo-oqu.pa43.p3.openshiftapps.com:443",method="POST"} 110 + + curl http://localhost:8080/metrics -o "$TMP_DIR/rollouts-metric-endpoint-output.txt" + GET_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "GET" | rev | cut -d' ' -f1` + PUT_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "PUT" | rev | cut -d' ' -f1` + POST_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "POST" | rev | cut -d' ' -f1` + + + if [[ "$GET_REQUESTS" == "" ]]; then + GET_REQUESTS=0 + fi + if [[ "$POST_REQUESTS" == "" ]]; then + POST_REQUESTS=0 + fi + if [[ "$PUT_REQUESTS" == "" ]]; then + PUT_REQUESTS=0 + fi + + + # 2) Extract the # of RolloutManager reconciles + + # Example: the metrics from /metric endpoint look like this: + # controller_runtime_reconcile_total{controller="rolloutmanager",result="error"} 0 + # controller_runtime_reconcile_total{controller="rolloutmanager",result="requeue"} 0 + # controller_runtime_reconcile_total{controller="rolloutmanager",result="requeue_after"} 0 + # controller_runtime_reconcile_total{controller="rolloutmanager",result="success"} 135 + ERROR_RECONCILES=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "controller_runtime_reconcile_total" | grep "error" | rev | cut -d' ' -f1` + SUCCESS_RECONCILES=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "controller_runtime_reconcile_total" | grep "success" | rev | cut -d' ' -f1` + + if [[ "$ERROR_RECONCILES" == "" ]]; then + ERROR_RECONCILES=0 + fi + + if [[ "$SUCCESS_RECONCILES" == "" ]]; then + SUCCESS_RECONCILES=0 + fi + +} + + + + + cd "$SCRIPTPATH/.." set -o pipefail @@ -13,12 +69,24 @@ set -o pipefail kubectl get crd/servicemonitors.monitoring.coreos.com &> /dev/null retVal=$? if [ $retVal -ne 0 ]; then - # If the CRD is not found, apply the CRD YAML - kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/release-0.52/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml + # If the CRD is not found, apply the CRD YAML + kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/release-0.52/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml fi + +# Before the test starts, extract initial metrics values +extract_metrics_data + +INITIAL_GET_REQUESTS=$GET_REQUESTS +INITIAL_PUT_REQUESTS=$PUT_REQUESTS +INITIAL_POST_REQUESTS=$POST_REQUESTS +INITIAL_ERROR_RECONCILES=$ERROR_RECONCILES +INITIAL_SUCCESS_RECONCILES=$SUCCESS_RECONCILES + + set -ex + if [ "$NAMESPACE_SCOPED_ARGO_ROLLOUTS" == "true" ]; then go test -v -p=1 -timeout=30m -race -count=1 -coverprofile=coverage.out ./tests/e2e/namespace-scoped @@ -29,7 +97,6 @@ else fi - set +e # If the output from the E2E operator is available, then check it for errors @@ -51,3 +118,60 @@ if [ -f "/tmp/e2e-operator-run.log" ]; then exit 1 fi fi + + +extract_metrics_data + +set -x + +FINAL_GET_REQUESTS=$GET_REQUESTS +FINAL_PUT_REQUESTS=$PUT_REQUESTS +FINAL_POST_REQUESTS=$POST_REQUESTS +FINAL_ERROR_RECONCILES=$ERROR_RECONCILES +FINAL_SUCCESS_RECONCILES=$SUCCESS_RECONCILES + +DELTA_GET_REQUESTS=`expr $FINAL_GET_REQUESTS - $INITIAL_GET_REQUESTS` +DELTA_PUT_REQUESTS=`expr $FINAL_PUT_REQUESTS - $INITIAL_PUT_REQUESTS` +DELTA_POST_REQUESTS=`expr $FINAL_POST_REQUESTS - $INITIAL_POST_REQUESTS` + +DELTA_ERROR_RECONCILES=`expr $FINAL_ERROR_RECONCILES - $INITIAL_ERROR_RECONCILES` +DELTA_SUCCESS_RECONCILES=`expr $FINAL_SUCCESS_RECONCILES - $INITIAL_SUCCESS_RECONCILES` + + +if [[ "$DELTA_POST_REQUESTS" == "0" ]]; then + echo "Unexpected number of REST client post requests: should be at least 1" + exit 1 +fi + +# Sanity test the behaviour of the operator during the tests + +# The # of PUT requests should be less than 40% of the # of POST requests +# - If the number is higher, this implies we are updating the .status or .spec fields of resources more than is necessary. +PUT_REQUEST_PERCENT=`expr "$DELTA_PUT_REQUESTS"00 / $DELTA_POST_REQUESTS` + +if [[ "`expr $PUT_REQUEST_PERCENT \> 40`" == "1" ]]; then + + echo "Put request %$PUT_REQUEST_PERCENT was greater than the expected value" + exit 1 + +fi + +if [[ "`expr $DELTA_ERROR_RECONCILES \> 20`" == "1" ]]; then + + echo "Number of Reconcile calls that returned an error $DELTA_ERROR_RECONCILES was greater than the expected value" + exit 1 + +fi + +if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 200`" == "1" ]]; then + + echo "Number of Reconcile calls that returned success $DELTA_SUCCESS_RECONCILES was greater than the expected value" + exit 1 + +fi + + + + +# The # of reconcile calls should be within an expected +# - This may need to be updated as we add new E2E tests. \ No newline at end of file From 17c8b5f391a24c093084d8969068353b77f016e9 Mon Sep 17 00:00:00 2001 From: Jonathan West Date: Fri, 19 Apr 2024 15:24:07 -0400 Subject: [PATCH 2/5] Refactor metric gathering portion of E2E test script Signed-off-by: Jonathan West --- hack/run-rollouts-manager-e2e-tests.sh | 97 ++++++++++++-------------- 1 file changed, 46 insertions(+), 51 deletions(-) diff --git a/hack/run-rollouts-manager-e2e-tests.sh b/hack/run-rollouts-manager-e2e-tests.sh index 09e842d..eb5f48a 100755 --- a/hack/run-rollouts-manager-e2e-tests.sh +++ b/hack/run-rollouts-manager-e2e-tests.sh @@ -8,11 +8,10 @@ SCRIPTPATH="$( # Treat undefined variables as errors set -u -TMP_DIR=$(mktemp -d 2>/dev/null || mktemp -d -t 'mytmpdir') +extract_metrics_data() { + TMP_DIR=$(mktemp -d 2>/dev/null || mktemp -d -t 'mytmpdir') -extract_metrics_data() { - # 1) Extract REST client get/put/post metrics # Example: the metrics from /metric endpoint look like this: @@ -25,7 +24,6 @@ extract_metrics_data() { PUT_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "PUT" | rev | cut -d' ' -f1` POST_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "POST" | rev | cut -d' ' -f1` - if [[ "$GET_REQUESTS" == "" ]]; then GET_REQUESTS=0 fi @@ -58,9 +56,6 @@ extract_metrics_data() { } - - - cd "$SCRIPTPATH/.." set -o pipefail @@ -73,10 +68,8 @@ if [ $retVal -ne 0 ]; then kubectl apply -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/release-0.52/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml fi - -# Before the test starts, extract initial metrics values +# Before the test starts, extract initial metrics values from the /metrics endpoint of the operator extract_metrics_data - INITIAL_GET_REQUESTS=$GET_REQUESTS INITIAL_PUT_REQUESTS=$PUT_REQUESTS INITIAL_POST_REQUESTS=$POST_REQUESTS @@ -84,8 +77,9 @@ INITIAL_ERROR_RECONCILES=$ERROR_RECONCILES INITIAL_SUCCESS_RECONCILES=$SUCCESS_RECONCILES -set -ex +# Run the tests +set -ex if [ "$NAMESPACE_SCOPED_ARGO_ROLLOUTS" == "true" ]; then @@ -119,59 +113,60 @@ if [ -f "/tmp/e2e-operator-run.log" ]; then fi fi +# Sanity test the behaviour of the operator during the tests: +# - We check the (prometheus) metrics coming from the 'localhost:8080/metrics' endpoint of the operator. +# - For example, if the reported # of Reconcile calls was unusually high, this might mean that the operator was stuck in a Reconcile loop +# - Or, if the number of REST client POST requests (e.g. k8s objection creation) was equal to the number of PUT request (e.g. k8s object update), this may imply we are updating .status or .spec of an object too frequently. +sanity_test_metrics_data() { -extract_metrics_data - -set -x + extract_metrics_data -FINAL_GET_REQUESTS=$GET_REQUESTS -FINAL_PUT_REQUESTS=$PUT_REQUESTS -FINAL_POST_REQUESTS=$POST_REQUESTS -FINAL_ERROR_RECONCILES=$ERROR_RECONCILES -FINAL_SUCCESS_RECONCILES=$SUCCESS_RECONCILES + set -x -DELTA_GET_REQUESTS=`expr $FINAL_GET_REQUESTS - $INITIAL_GET_REQUESTS` -DELTA_PUT_REQUESTS=`expr $FINAL_PUT_REQUESTS - $INITIAL_PUT_REQUESTS` -DELTA_POST_REQUESTS=`expr $FINAL_POST_REQUESTS - $INITIAL_POST_REQUESTS` + FINAL_GET_REQUESTS=$GET_REQUESTS + FINAL_PUT_REQUESTS=$PUT_REQUESTS + FINAL_POST_REQUESTS=$POST_REQUESTS + FINAL_ERROR_RECONCILES=$ERROR_RECONCILES + FINAL_SUCCESS_RECONCILES=$SUCCESS_RECONCILES -DELTA_ERROR_RECONCILES=`expr $FINAL_ERROR_RECONCILES - $INITIAL_ERROR_RECONCILES` -DELTA_SUCCESS_RECONCILES=`expr $FINAL_SUCCESS_RECONCILES - $INITIAL_SUCCESS_RECONCILES` + DELTA_GET_REQUESTS=`expr $FINAL_GET_REQUESTS - $INITIAL_GET_REQUESTS` + DELTA_PUT_REQUESTS=`expr $FINAL_PUT_REQUESTS - $INITIAL_PUT_REQUESTS` + DELTA_POST_REQUESTS=`expr $FINAL_POST_REQUESTS - $INITIAL_POST_REQUESTS` + DELTA_ERROR_RECONCILES=`expr $FINAL_ERROR_RECONCILES - $INITIAL_ERROR_RECONCILES` + DELTA_SUCCESS_RECONCILES=`expr $FINAL_SUCCESS_RECONCILES - $INITIAL_SUCCESS_RECONCILES` -if [[ "$DELTA_POST_REQUESTS" == "0" ]]; then - echo "Unexpected number of REST client post requests: should be at least 1" - exit 1 -fi + if [[ "$DELTA_POST_REQUESTS" == "0" ]]; then + echo "Unexpected number of REST client post requests: should be at least 1" + exit 1 + fi -# Sanity test the behaviour of the operator during the tests + # The # of PUT requests should be less than 40% of the # of POST requests + # - If the number is higher, this implies we are updating the .status or .spec fields of resources more than is necessary. + PUT_REQUEST_PERCENT=`expr "$DELTA_PUT_REQUESTS"00 / $DELTA_POST_REQUESTS` -# The # of PUT requests should be less than 40% of the # of POST requests -# - If the number is higher, this implies we are updating the .status or .spec fields of resources more than is necessary. -PUT_REQUEST_PERCENT=`expr "$DELTA_PUT_REQUESTS"00 / $DELTA_POST_REQUESTS` + if [[ "`expr $PUT_REQUEST_PERCENT \> 40`" == "1" ]]; then + # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) -if [[ "`expr $PUT_REQUEST_PERCENT \> 40`" == "1" ]]; then + echo "Put request %$PUT_REQUEST_PERCENT was greater than the expected value" + exit 1 - echo "Put request %$PUT_REQUEST_PERCENT was greater than the expected value" - exit 1 - -fi - -if [[ "`expr $DELTA_ERROR_RECONCILES \> 20`" == "1" ]]; then - - echo "Number of Reconcile calls that returned an error $DELTA_ERROR_RECONCILES was greater than the expected value" - exit 1 - -fi - -if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 200`" == "1" ]]; then + fi - echo "Number of Reconcile calls that returned success $DELTA_SUCCESS_RECONCILES was greater than the expected value" - exit 1 + if [[ "`expr $DELTA_ERROR_RECONCILES \> 20`" == "1" ]]; then + # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) + echo "Number of Reconcile calls that returned an error $DELTA_ERROR_RECONCILES was greater than the expected value" + exit 1 -fi + fi + if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 200`" == "1" ]]; then + # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) + echo "Number of Reconcile calls that returned success $DELTA_SUCCESS_RECONCILES was greater than the expected value" + exit 1 + fi +} -# The # of reconcile calls should be within an expected -# - This may need to be updated as we add new E2E tests. \ No newline at end of file +sanity_test_metrics_data \ No newline at end of file From fc63a85d90f0742245a35d81ddbe50906a117126 Mon Sep 17 00:00:00 2001 From: Jonathan West Date: Tue, 27 Aug 2024 21:24:53 -0400 Subject: [PATCH 3/5] Refactor E2E script, reduce excess reconciling of resources Signed-off-by: Jonathan West --- controllers/configmap.go | 6 +- controllers/configmap_test.go | 3 +- controllers/resources.go | 22 ++++---- controllers/utils.go | 30 ++++++++++ controllers/utils_test.go | 61 ++++++++++++++------ hack/run-rollouts-manager-e2e-tests.sh | 77 ++++++++++++++++---------- 6 files changed, 137 insertions(+), 62 deletions(-) diff --git a/controllers/configmap.go b/controllers/configmap.go index 641be57..925826c 100644 --- a/controllers/configmap.go +++ b/controllers/configmap.go @@ -146,12 +146,12 @@ func (r *RolloutManagerReconciler) reconcileConfigMap(ctx context.Context, cr ro actualConfigMap.Data[TrafficRouterPluginConfigMapKey] = string(desiredTrafficRouterPluginString) actualConfigMap.Data[MetricPluginConfigMapKey] = string(desiredMetricPluginString) + log.Info("Updating Rollouts ConfigMap due to detected difference") + // Update the ConfigMap in the cluster if err := r.Client.Update(ctx, actualConfigMap); err != nil { return fmt.Errorf("failed to update ConfigMap: %v", err) } - log.Info("ConfigMap updated successfully") - // Restarting rollouts pod only if configMap is updated if err := r.restartRolloutsPod(ctx, cr.Namespace); err != nil { return err @@ -183,8 +183,8 @@ func (r *RolloutManagerReconciler) restartRolloutsPod(ctx context.Context, names for i := range podList.Items { pod := podList.Items[i] - log.Info("Deleting Rollouts Pod", "podName", pod.Name) if pod.ObjectMeta.DeletionTimestamp == nil { + log.Info("Deleting Rollouts Pod", "podName", pod.Name) if err := r.Client.Delete(ctx, &pod); err != nil { if errors.IsNotFound(err) { log.Info(fmt.Sprintf("Pod %s already deleted", pod.Name)) diff --git a/controllers/configmap_test.go b/controllers/configmap_test.go index d6bce90..70df5a5 100644 --- a/controllers/configmap_test.go +++ b/controllers/configmap_test.go @@ -9,7 +9,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" appsv1 "k8s.io/api/apps/v1" - v1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -19,7 +18,7 @@ var _ = Describe("ConfigMap Test", func() { var a v1alpha1.RolloutManager var r *RolloutManagerReconciler var sa *corev1.ServiceAccount - var existingDeployment *v1.Deployment + var existingDeployment *appsv1.Deployment const trafficrouterPluginLocation = "https://custom-traffic-plugin-location" const metricPluginLocation = "https://custom-metric-plugin-location" diff --git a/controllers/resources.go b/controllers/resources.go index f50572a..b88acc3 100644 --- a/controllers/resources.go +++ b/controllers/resources.go @@ -46,7 +46,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsServiceAccount(ctx context.C normalizedLiveServiceAccount := liveServiceAccount.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveServiceAccount.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveServiceAccount.Labels, expectedServiceAccount.Labels) || !reflect.DeepEqual(normalizedLiveServiceAccount.Annotations, expectedServiceAccount.Annotations) { + if !areStringMapsEqual(normalizedLiveServiceAccount.Labels, expectedServiceAccount.Labels) || !areStringMapsEqual(normalizedLiveServiceAccount.Annotations, expectedServiceAccount.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of ServiceAccount %s do not match the expected state, hence updating it", liveServiceAccount.Name)) @@ -111,7 +111,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsRole(ctx context.Context, cr removeUserLabelsAndAnnotations(&normalizedLiveRole.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveRole.Labels, expectedRole.Labels) || !reflect.DeepEqual(normalizedLiveRole.Annotations, expectedRole.Annotations) { + if !areStringMapsEqual(normalizedLiveRole.Labels, expectedRole.Labels) || !areStringMapsEqual(normalizedLiveRole.Annotations, expectedRole.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of Role %s do not match the expected state, hence updating it", liveRole.Name)) @@ -165,7 +165,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsClusterRole(ctx context.Cont normalizedLiveClusterRole := liveClusterRole.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveClusterRole.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !reflect.DeepEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { + if !areStringMapsEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !areStringMapsEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of Role %s do not match the expected state, hence updating it", liveClusterRole.Name)) @@ -248,7 +248,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsRoleBinding(ctx context.Cont normalizedLiveRoleBinding := liveRoleBinding.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveRoleBinding.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveRoleBinding.Labels, expectedRoleBinding.Labels) || !reflect.DeepEqual(normalizedLiveRoleBinding.Annotations, expectedRoleBinding.Annotations) { + if !areStringMapsEqual(normalizedLiveRoleBinding.Labels, expectedRoleBinding.Labels) || !areStringMapsEqual(normalizedLiveRoleBinding.Annotations, expectedRoleBinding.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of RoleBinding %s do not match the expected state, hence updating it", liveRoleBinding.Name)) @@ -327,7 +327,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsClusterRoleBinding(ctx conte normalizedLiveClusterRoleBinding := liveClusterRoleBinding.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveClusterRoleBinding.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveClusterRoleBinding.Labels, expectedClusterRoleBinding.Labels) || !reflect.DeepEqual(normalizedLiveClusterRoleBinding.Annotations, expectedClusterRoleBinding.Annotations) { + if !areStringMapsEqual(normalizedLiveClusterRoleBinding.Labels, expectedClusterRoleBinding.Labels) || !areStringMapsEqual(normalizedLiveClusterRoleBinding.Annotations, expectedClusterRoleBinding.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of ClusterRoleBinding %s do not match the expected state, hence updating it", liveClusterRoleBinding.Name)) @@ -460,7 +460,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsAggregateToAdminClusterRole( normalizedLiveClusterRole := liveClusterRole.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveClusterRole.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !reflect.DeepEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { + if !areStringMapsEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !areStringMapsEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of aggregated ClusterRole %s do not match the expected state, hence updating it", liveClusterRole.Name)) @@ -512,7 +512,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsAggregateToEditClusterRole(c normalizedLiveClusterRole := liveClusterRole.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveClusterRole.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !reflect.DeepEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { + if !areStringMapsEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !areStringMapsEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of aggregated ClusterRole %s do not match the expected state, hence updating it", liveClusterRole.Name)) @@ -564,7 +564,8 @@ func (r *RolloutManagerReconciler) reconcileRolloutsAggregateToViewClusterRole(c normalizedLiveClusterRole := liveClusterRole.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveClusterRole.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !reflect.DeepEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { + if !areStringMapsEqual(normalizedLiveClusterRole.Labels, expectedClusterRole.Labels) || !areStringMapsEqual(normalizedLiveClusterRole.Annotations, expectedClusterRole.Annotations) { + updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of aggregated ClusterRole %s do not match the expected state, hence updating it", liveClusterRole.Name)) @@ -702,7 +703,8 @@ func (r *RolloutManagerReconciler) reconcileRolloutsMetricsService(ctx context.C normalizedLiveService := liveService.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveService.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveService.Labels, expectedSvc.Labels) || !reflect.DeepEqual(normalizedLiveService.Annotations, expectedSvc.Annotations) { + if !areStringMapsEqual(normalizedLiveService.Labels, expectedSvc.Labels) || !areStringMapsEqual(normalizedLiveService.Annotations, expectedSvc.Annotations) { + updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of metrics Service %s do not match the expected state, hence updating it", liveService.Name)) @@ -778,7 +780,7 @@ func (r *RolloutManagerReconciler) reconcileRolloutsSecrets(ctx context.Context, normalizedLiveSecret := liveSecret.DeepCopy() removeUserLabelsAndAnnotations(&normalizedLiveSecret.ObjectMeta, cr) - if !reflect.DeepEqual(normalizedLiveSecret.Labels, expectedSecret.Labels) || !reflect.DeepEqual(normalizedLiveSecret.Annotations, expectedSecret.Annotations) { + if !areStringMapsEqual(normalizedLiveSecret.Labels, expectedSecret.Labels) || !areStringMapsEqual(normalizedLiveSecret.Annotations, expectedSecret.Annotations) { updateNeeded = true log.Info(fmt.Sprintf("Labels/Annotations of Secret %s do not match the expected state, hence updating it", liveSecret.Name)) diff --git a/controllers/utils.go b/controllers/utils.go index 4a08a8a..205d013 100644 --- a/controllers/utils.go +++ b/controllers/utils.go @@ -5,6 +5,7 @@ import ( "errors" "fmt" "os" + "reflect" "sort" "strings" @@ -86,6 +87,30 @@ func appendStringMap(src map[string]string, add map[string]string) map[string]st return res } +// In go, reflect.DeepEqual(a, b), on maps a and b, will return false even when len(a) == 0 and len(b) == 0. This is due to a nil map being considered different from an empty map. +// This function handles this case better: if maps both have a length of 0, we consider them always equal. +func areStringMapsEqual(one map[string]string, two map[string]string) bool { + + isOneEmpty := false + + isTwoEmpty := false + + if len(one) == 0 { + isOneEmpty = true + } + + if len(two) == 0 { + isTwoEmpty = true + } + + if isOneEmpty && isTwoEmpty { + return true + } + + return reflect.DeepEqual(one, two) + +} + // combineStringMaps will combine multiple maps: maps defined earlier in the 'maps' slice may have their values overriden by maps defined later in the 'maps' slice. func combineStringMaps(maps ...map[string]string) map[string]string { @@ -448,6 +473,11 @@ func removeUserLabelsAndAnnotations(obj *metav1.ObjectMeta, cr rolloutsmanagerv1 for objectLabelKey := range obj.Labels { + // We add this label to the aggregated cluster roles we generate, so we should not process it as a user label. + if strings.Contains(objectLabelKey, "rbac.authorization.k8s.io/") && strings.Contains(objectLabelKey, "aggregate") { + continue + } + existsInDefault := false for defaultLabelKey := range defaultLabelsAndAnnotations.Labels { diff --git a/controllers/utils_test.go b/controllers/utils_test.go index 225906f..1412ad4 100644 --- a/controllers/utils_test.go +++ b/controllers/utils_test.go @@ -452,7 +452,6 @@ var _ = Describe("removeUserLabelsAndAnnotations tests", func() { } Expect(k8sClient.Create(ctx, &cr)).To(Succeed()) - setRolloutsLabelsAndAnnotations(&obj) removeUserLabelsAndAnnotations(&obj, cr) @@ -460,34 +459,29 @@ var _ = Describe("removeUserLabelsAndAnnotations tests", func() { Expect(obj.Annotations).To(Equal(expectedAnnotations)) }, Entry("when no user-defined labels or annotations exist", - map[string]string{}, map[string]string{}, - map[string]string{"app.kubernetes.io/name": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/part-of": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/component": DefaultArgoRolloutsResourceName}, + map[string]string{}, + map[string]string{}, + map[string]string{}, map[string]string{}, ), Entry("when user-defined labels and annotations are present", - map[string]string{"user-label": "value"}, map[string]string{"user-annotation": "value"}, - map[string]string{"app.kubernetes.io/name": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/part-of": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/component": DefaultArgoRolloutsResourceName}, + map[string]string{"user-label": "value"}, + map[string]string{"user-annotation": "value"}, + map[string]string{}, map[string]string{}, ), Entry("when user-defined labels are present and annotations are empty", - map[string]string{"user-label": "value"}, map[string]string{}, - map[string]string{"app.kubernetes.io/name": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/part-of": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/component": DefaultArgoRolloutsResourceName}, + map[string]string{"user-label": "value"}, + map[string]string{}, + map[string]string{}, map[string]string{}, ), Entry("when user-defined labels are empty and annotations are present", - map[string]string{}, map[string]string{"user-annotation": "value"}, - map[string]string{"app.kubernetes.io/name": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/part-of": DefaultArgoRolloutsResourceName, - "app.kubernetes.io/component": DefaultArgoRolloutsResourceName}, + map[string]string{}, + map[string]string{"user-annotation": "value"}, + map[string]string{}, map[string]string{}, ), - Entry("when both user and non-user-defined labels are present, and annotations are empty", map[string]string{"user-label": "value", "app.kubernetes.io/name": DefaultArgoRolloutsResourceName, @@ -499,6 +493,20 @@ var _ = Describe("removeUserLabelsAndAnnotations tests", func() { "app.kubernetes.io/component": DefaultArgoRolloutsResourceName}, map[string]string{}, ), + Entry("rbac aggregate-to- labels should not be removed, as we add these to ClusterRoles", + map[string]string{ + "rbac.authorization.k8s.io/aggregate-to-admin": "true", + "app.kubernetes.io/name": DefaultArgoRolloutsResourceName, + "app.kubernetes.io/part-of": DefaultArgoRolloutsResourceName, + "app.kubernetes.io/component": DefaultArgoRolloutsResourceName}, + map[string]string{}, + map[string]string{ + "rbac.authorization.k8s.io/aggregate-to-admin": "true", + "app.kubernetes.io/name": DefaultArgoRolloutsResourceName, + "app.kubernetes.io/part-of": DefaultArgoRolloutsResourceName, + "app.kubernetes.io/component": DefaultArgoRolloutsResourceName}, + map[string]string{}, + ), ) }) @@ -683,6 +691,23 @@ var _ = Describe("setAdditionalRolloutsLabelsAndAnnotationsToObject tests", func }) }) +var _ = Describe("areStringMapsEqual tests", func() { + DescribeTable("areStringMapsEqual should match reflect.DeepEqual(), except when comparing nil with empty map", func(one map[string]string, two map[string]string, expectedRes bool) { + + res := areStringMapsEqual(one, two) + Expect(res).To(Equal(expectedRes)) + }, + Entry("both nil", nil, nil, true), + Entry("one nil, one empty", map[string]string{}, nil, true), + Entry("one empty, one nil", nil, map[string]string{}, true), + Entry("both empty", map[string]string{}, map[string]string{}, true), + Entry("one empty, one has value", map[string]string{}, map[string]string{"key": "value"}, false), + Entry("one has value, one nil", map[string]string{"key": "value"}, nil, false), + Entry("equal values", map[string]string{"key": "value"}, map[string]string{"key": "value"}, true), + ) + +}) + var _ = Describe("envMerge tests", func() { DescribeTable("merges two slices of EnvVar entries", func(existing, merge, expected []corev1.EnvVar, override bool) { diff --git a/hack/run-rollouts-manager-e2e-tests.sh b/hack/run-rollouts-manager-e2e-tests.sh index eb5f48a..29f6845 100755 --- a/hack/run-rollouts-manager-e2e-tests.sh +++ b/hack/run-rollouts-manager-e2e-tests.sh @@ -5,6 +5,15 @@ SCRIPTPATH="$( pwd -P )" +function cleanup { + echo "* Cleaning up" + killall main || true + killall go || true +} + +trap cleanup EXIT + + # Treat undefined variables as errors set -u @@ -12,6 +21,15 @@ extract_metrics_data() { TMP_DIR=$(mktemp -d 2>/dev/null || mktemp -d -t 'mytmpdir') + while true; do + curl http://localhost:8080/metrics -o "$TMP_DIR/rollouts-metric-endpoint-output.txt" + if [ "$?" == "0" ]; then + break + fi + echo "* Waiting for Metrics endpoint to become available" + sleep 3 + done + # 1) Extract REST client get/put/post metrics # Example: the metrics from /metric endpoint look like this: @@ -20,9 +38,10 @@ extract_metrics_data() { # rest_client_requests_total{code="201",host="api.pgqqd-novoo-oqu.pa43.p3.openshiftapps.com:443",method="POST"} 110 curl http://localhost:8080/metrics -o "$TMP_DIR/rollouts-metric-endpoint-output.txt" - GET_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "GET" | rev | cut -d' ' -f1` - PUT_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "PUT" | rev | cut -d' ' -f1` - POST_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "POST" | rev | cut -d' ' -f1` + + GET_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "GET" | rev | cut -d' ' -f1 | rev` + PUT_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "PUT" | grep -v "409" | rev | cut -d' ' -f1 | rev` + POST_REQUESTS=`cat "$TMP_DIR/rollouts-metric-endpoint-output.txt" | grep "rest_client_requests_total" | grep "POST" | rev | cut -d' ' -f1 | rev` if [[ "$GET_REQUESTS" == "" ]]; then GET_REQUESTS=0 @@ -91,28 +110,6 @@ else fi -set +e - -# If the output from the E2E operator is available, then check it for errors -if [ -f "/tmp/e2e-operator-run.log" ]; then - - # Wait for the controller to flush to the file, before killing the controller - sleep 10 - killall main - sleep 5 - - # Grep the log for unexpected errors - # - Ignore errors that are expected to occur - - UNEXPECTED_ERRORS_FOUND_TEXT=`cat /tmp/e2e-operator-run.log | grep "ERROR" | grep -v "because it is being terminated" | grep -v "the object has been modified; please apply your changes to the latest version and try again" | grep -v "unable to fetch" | grep -v "StorageError"` | grep -v "client rate limiter Wait returned an error: context canceled" - UNEXPECTED_ERRORS_COUNT=`echo $UNEXPECTED_ERRORS_FOUND_TEXT | grep "ERROR" | wc -l` - - if [ "$UNEXPECTED_ERRORS_COUNT" != "0" ]; then - echo "Unexpected errors found: $UNEXPECTED_ERRORS_FOUND_TEXT" - exit 1 - fi -fi - # Sanity test the behaviour of the operator during the tests: # - We check the (prometheus) metrics coming from the 'localhost:8080/metrics' endpoint of the operator. # - For example, if the reported # of Reconcile calls was unusually high, this might mean that the operator was stuck in a Reconcile loop @@ -148,19 +145,19 @@ sanity_test_metrics_data() { if [[ "`expr $PUT_REQUEST_PERCENT \> 40`" == "1" ]]; then # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) - echo "Put request %$PUT_REQUEST_PERCENT was greater than the expected value" + echo "Put request was %$PUT_REQUEST_PERCENT greater than the expected value" exit 1 fi - if [[ "`expr $DELTA_ERROR_RECONCILES \> 20`" == "1" ]]; then + if [[ "`expr $DELTA_ERROR_RECONCILES \> 30`" == "1" ]]; then # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) echo "Number of Reconcile calls that returned an error $DELTA_ERROR_RECONCILES was greater than the expected value" exit 1 fi - if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 200`" == "1" ]]; then + if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 250`" == "1" ]]; then # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) echo "Number of Reconcile calls that returned success $DELTA_SUCCESS_RECONCILES was greater than the expected value" @@ -169,4 +166,26 @@ sanity_test_metrics_data() { fi } -sanity_test_metrics_data \ No newline at end of file +sanity_test_metrics_data + +set +e + +# If the output from the E2E operator is available, then check it for errors +if [ -f "/tmp/e2e-operator-run.log" ]; then + + # Wait for the controller to flush to the file, before killing the controller + sleep 10 + killall main + sleep 5 + + # Grep the log for unexpected errors + # - Ignore errors that are expected to occur + + UNEXPECTED_ERRORS_FOUND_TEXT=`cat /tmp/e2e-operator-run.log | grep "ERROR" | grep -v "because it is being terminated" | grep -v "the object has been modified; please apply your changes to the latest version and try again" | grep -v "unable to fetch" | grep -v "StorageError"` | grep -v "client rate limiter Wait returned an error: context canceled" + UNEXPECTED_ERRORS_COUNT=`echo $UNEXPECTED_ERRORS_FOUND_TEXT | grep "ERROR" | wc -l` + + if [ "$UNEXPECTED_ERRORS_COUNT" != "0" ]; then + echo "Unexpected errors found: $UNEXPECTED_ERRORS_FOUND_TEXT" + exit 1 + fi +fi From f399ca63a772860b09e126db1d572a89084a21a9 Mon Sep 17 00:00:00 2001 From: Jonathan West Date: Tue, 1 Oct 2024 14:34:18 -0400 Subject: [PATCH 4/5] Add timeout, fix Rollouts E2E script and Makefile bugs Signed-off-by: Jonathan West --- .github/workflows/e2e_tests.yml | 1 + .github/workflows/rollouts_e2e_tests.yml | 1 + Makefile | 4 ++-- hack/run-rollouts-manager-e2e-tests.sh | 20 ++++++++++++++------ 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/e2e_tests.yml b/.github/workflows/e2e_tests.yml index 3313bb3..b369d8c 100644 --- a/.github/workflows/e2e_tests.yml +++ b/.github/workflows/e2e_tests.yml @@ -12,6 +12,7 @@ jobs: test-e2e: name: Run end-to-end tests + timeout-minutes: 90 runs-on: ubuntu-latest strategy: matrix: diff --git a/.github/workflows/rollouts_e2e_tests.yml b/.github/workflows/rollouts_e2e_tests.yml index da552a6..c71200b 100644 --- a/.github/workflows/rollouts_e2e_tests.yml +++ b/.github/workflows/rollouts_e2e_tests.yml @@ -13,6 +13,7 @@ jobs: test-e2e: name: Run end-to-end tests from upstream Argo Rollouts repo runs-on: ubuntu-latest + timeout-minutes: 90 strategy: matrix: k3s-version: [ v1.28.2 ] diff --git a/Makefile b/Makefile index c4e5cf3..5afc515 100644 --- a/Makefile +++ b/Makefile @@ -136,7 +136,7 @@ start-e2e-namespace-scoped-bg: ## Start the operator, to run the e2e tests .PHONY: start-e2e-cluster-scoped-bg start-e2e-cluster-scoped-bg: ## Start the operator, to run the e2e tests - RUN_IN_BACKGROUND=true hack/start-rollouts-manager-for-e2e-tests.sh + RUN_IN_BACKGROUND=true NAMESPACE_SCOPED_ARGO_ROLLOUTS=false hack/start-rollouts-manager-for-e2e-tests.sh .PHONY: test-e2e-namespace-scoped @@ -145,7 +145,7 @@ test-e2e-namespace-scoped: ## Run operator e2e tests .PHONY: test-e2e-cluster-scoped test-e2e-cluster-scoped: ## Run operator e2e tests - hack/run-rollouts-manager-e2e-tests.sh + NAMESPACE_SCOPED_ARGO_ROLLOUTS=false hack/run-rollouts-manager-e2e-tests.sh .PHONY: start-test-e2e-all start-test-e2e-all: start-e2e-namespace-scoped-bg test-e2e-namespace-scoped start-e2e-cluster-scoped-bg test-e2e-cluster-scoped diff --git a/hack/run-rollouts-manager-e2e-tests.sh b/hack/run-rollouts-manager-e2e-tests.sh index 29f6845..154323d 100755 --- a/hack/run-rollouts-manager-e2e-tests.sh +++ b/hack/run-rollouts-manager-e2e-tests.sh @@ -157,7 +157,7 @@ sanity_test_metrics_data() { fi - if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 250`" == "1" ]]; then + if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 1200`" == "1" ]]; then # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) echo "Number of Reconcile calls that returned success $DELTA_SUCCESS_RECONCILES was greater than the expected value" @@ -181,11 +181,19 @@ if [ -f "/tmp/e2e-operator-run.log" ]; then # Grep the log for unexpected errors # - Ignore errors that are expected to occur - UNEXPECTED_ERRORS_FOUND_TEXT=`cat /tmp/e2e-operator-run.log | grep "ERROR" | grep -v "because it is being terminated" | grep -v "the object has been modified; please apply your changes to the latest version and try again" | grep -v "unable to fetch" | grep -v "StorageError"` | grep -v "client rate limiter Wait returned an error: context canceled" - UNEXPECTED_ERRORS_COUNT=`echo $UNEXPECTED_ERRORS_FOUND_TEXT | grep "ERROR" | wc -l` + set +u # allow undefined vars + + UNEXPECTED_ERRORS_FOUND_TEXT=`cat /tmp/e2e-operator-run.log | grep "ERROR" | grep -v "because it is being terminated" | grep -v "the object has been modified; please apply your changes to the latest version and try again" | grep -v "unable to fetch" | grep -v "StorageError" | grep -v "client rate limiter Wait returned an error: context canceled"` + + if [ "$UNEXPECTED_ERRORS_FOUND_TEXT" != "" ]; then - if [ "$UNEXPECTED_ERRORS_COUNT" != "0" ]; then - echo "Unexpected errors found: $UNEXPECTED_ERRORS_FOUND_TEXT" - exit 1 + UNEXPECTED_ERRORS_COUNT=`echo $UNEXPECTED_ERRORS_FOUND_TEXT | grep "ERROR" | wc -l` + + if [ "$UNEXPECTED_ERRORS_COUNT" != "0" ]; then + echo "Unexpected errors found: $UNEXPECTED_ERRORS_FOUND_TEXT" + exit 1 + fi fi + + fi From bf50c4124df38832a6829b9ebb1c6368ea4bf16a Mon Sep 17 00:00:00 2001 From: Jonathan West Date: Wed, 9 Oct 2024 12:20:52 -0400 Subject: [PATCH 5/5] Increase number of expected reconciles Signed-off-by: Jonathan West --- hack/run-rollouts-manager-e2e-tests.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hack/run-rollouts-manager-e2e-tests.sh b/hack/run-rollouts-manager-e2e-tests.sh index 154323d..7ac09c3 100755 --- a/hack/run-rollouts-manager-e2e-tests.sh +++ b/hack/run-rollouts-manager-e2e-tests.sh @@ -150,9 +150,9 @@ sanity_test_metrics_data() { fi - if [[ "`expr $DELTA_ERROR_RECONCILES \> 30`" == "1" ]]; then + if [[ "`expr $DELTA_ERROR_RECONCILES \> 60`" == "1" ]]; then # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) - echo "Number of Reconcile calls that returned an error $DELTA_ERROR_RECONCILES was greater than the expected value" + echo "Number of Reconcile calls that returned an error '$DELTA_ERROR_RECONCILES' was greater than the expected value" exit 1 fi @@ -160,7 +160,7 @@ sanity_test_metrics_data() { if [[ "`expr $DELTA_SUCCESS_RECONCILES \> 1200`" == "1" ]]; then # This value is arbitrary, and should be updated if at any point it becomes inaccurate (but first audit the test/code to make sure it is not an actual product/test issue, before increasing) - echo "Number of Reconcile calls that returned success $DELTA_SUCCESS_RECONCILES was greater than the expected value" + echo "Number of Reconcile calls that returned success '$DELTA_SUCCESS_RECONCILES' was greater than the expected value" exit 1 fi