From 3aa924a655d56f1d6a51b8e7af0cd650a279d745 Mon Sep 17 00:00:00 2001
From: Jan Safranek
Date: Thu, 1 Aug 2024 11:21:20 +0200
Subject: [PATCH] Add CSI test for LUN overflow

Add an OpenShift specific test that creates > 256 PVs + Pods on a single
node in a large batch. This makes sure that a CSI driver can support a
larger number of separate volumes per node. There was a case where a CSI
driver created LUN numbers that were too high (256), which the Linux
kernel did not support.

All pods are created at the same time, expecting that the CSI driver
reports a correct attach limit and that the Kubernetes scheduler
respects it.
---
 pkg/clioptions/clusterdiscovery/csi.go     |   2 +-
 pkg/testsuites/standard_suites.go          |  15 +-
 test/extended/storage/csi/README.md        |  85 +++++++++
 test/extended/storage/csi/csi.go           |   7 +-
 test/extended/storage/csi/scsi_overflow.go | 207 +++++++++++++++++++++
 5 files changed, 309 insertions(+), 7 deletions(-)
 create mode 100644 test/extended/storage/csi/README.md
 create mode 100644 test/extended/storage/csi/scsi_overflow.go

diff --git a/pkg/clioptions/clusterdiscovery/csi.go b/pkg/clioptions/clusterdiscovery/csi.go
index 970607abf3b0..0e84a8daa72a 100644
--- a/pkg/clioptions/clusterdiscovery/csi.go
+++ b/pkg/clioptions/clusterdiscovery/csi.go
@@ -15,7 +15,7 @@ import (

 const (
 	CSIManifestEnvVar = "TEST_CSI_DRIVER_FILES"
-	OCPManifestEnvVar = "TEST_OCP_DRIVER_FILES"
+	OCPManifestEnvVar = "TEST_OCP_CSI_DRIVER_FILES"
 )

 // Initialize openshift/csi suite, i.e. define CSI tests from TEST_CSI_DRIVER_FILES.
diff --git a/pkg/testsuites/standard_suites.go b/pkg/testsuites/standard_suites.go
index 5588e8e8bd35..ac037025dcfd 100644
--- a/pkg/testsuites/standard_suites.go
+++ b/pkg/testsuites/standard_suites.go
@@ -228,9 +228,18 @@ var staticSuites = []ginkgo.TestSuite{
 	{
 		Name: "openshift/csi",
 		Description: templates.LongDesc(`
-		Run tests for an CSI driver. Set the TEST_CSI_DRIVER_FILES environment variable to the name of file with
-		CSI driver test manifest. The manifest specifies Kubernetes + CSI features to test with the driver.
-		See https://github.com/kubernetes/kubernetes/blob/master/test/e2e/storage/external/README.md for required format of the file.
+		Run tests for a CSI driver. The CSI driver tests are configured by two YAML manifest files.
+		The TEST_CSI_DRIVER_FILES environment variable must be the name of a file with the upstream
+		CSI driver test manifest. See
+		https://github.com/openshift/kubernetes/blob/master/test/e2e/storage/external/README.md for
+		the required format of the file. Replace "master" with the OpenShift version you are testing
+		against, e.g. "blob/release-4.17/test/..."
+		The TEST_OCP_CSI_DRIVER_FILES environment variable is optional and, when set, must be the
+		name of a file with the OCP specific CSI driver test manifest. By specifying this file, the
+		test suite will run the OCP specific tests in addition to the upstream tests. See
+		https://github.com/openshift/origin/tree/master/test/extended/storage/csi for the required
+		format of the file. Replace "master" with the OpenShift version you are testing against,
+		e.g. "blob/release-4.17/test/..."
 		`),
 		Matches: func(name string) bool {
 			if isDisabled(name) {
diff --git a/test/extended/storage/csi/README.md b/test/extended/storage/csi/README.md
new file mode 100644
index 000000000000..6d73169b883d
--- /dev/null
+++ b/test/extended/storage/csi/README.md
@@ -0,0 +1,85 @@
+# OpenShift CSI certification tests
+
+## Intro
+
+The OpenShift `openshift/csi` test suite contains tests that exercise features of an already installed CSI driver.
+We re-use the [upstream storage tests](https://github.com/openshift/kubernetes/blob/master/test/e2e/storage/external/README.md), including their YAML manifest format, and add a few OpenShift specific tests on top of them.
+
+Note: this documentation is not supported by Red Hat. It's here to help with debugging the tests or the CSI driver. Follow the official Red Hat documentation to submit official CSI driver test results.
+
+## Manifests
+
+Two YAML manifest files control what CSI driver features are tested and how. The `openshift-tests` binary accepts two environment variables:
+
+* `TEST_CSI_DRIVER_FILES`: path to a file with the **upstream** test manifest. See the [upstream documentation](https://github.com/openshift/kubernetes/blob/master/test/e2e/storage/external/README.md) for full details. This env. variable is mandatory.
+* `TEST_OCP_CSI_DRIVER_FILES`: path to a file with the **OpenShift specific** test manifest; see below for its format.
+
+### OpenShift specific manifest
+
+Example:
+
+```yaml
+Driver:
+LUNStressTest:
+  PodsTotal: 260
+  Timeout: "40m"
+```
+
+`LUNStressTest` is a test that stresses the CSI driver on a single node. The test picks a random schedulable node and creates the configured number of Pods + PVCs on it (260 by default).
+
+* Each Pod has its own PVC that needs to be dynamically provisioned by the CSI driver.
+* Each Pod does something very simple (like `ls /mnt/the_volume`) and exits quickly.
+* While all these Pods are created relatively quickly, the test *does not* expect all Pods to run in parallel!
+  * We expect the CSI driver to return timeouts and other errors when it gets too many requests. OpenShift / CSI sidecars will retry with exponential backoff.
+  * Kubernetes should respect the CSI driver attach limit reported in CSINode, so only that many Pods can ever run in parallel.
+  * There is [a bug in Kubernetes](https://github.com/kubernetes/kubernetes/issues/126502) where the scheduler can put more Pods on a single node than the CSI driver supports. We expect the CSI driver to be robust and return a reasonable error to `ControllerPublish`, `NodeStage` or `NodePublish` when it's over the limit.
+* The timeout can be generous to allow enough time for dynamic provisioning, volume attach, mount, unmount, detach and PV deletion of 260 volumes.
+* No other test runs in parallel with this test, so the CSI driver can fully focus on this stress.
+
+* `PodsTotal`: how many Pods to create, 260 by default.
+* `Timeout`: how long to wait for these Pods to finish. Accepts [golang `ParseDuration` suffixes](https://pkg.go.dev/time#ParseDuration), such as `"1h30m15s"` for 1 hour, 30 minutes and 15 seconds.
+
+We strongly recommend testing with 257 or more Pods, and we suggest that the test finish in under 1 hour. There were cases where a CSI driver / RHCOS node configuration had issues with LUN numbers higher than 256. Even when a CSI driver does not use LUNs, it's a nice stress test that checks that the CSI driver reports a reasonable attach limit and can deal with some load.
+
+## Usage
+
+### With `openshift-tests` binary
+
+1. Either compile your own `openshift-tests` binary (run `make` in this repo) or extract it from an OpenShift image. **Always use the `openshift-tests` binary that corresponds to the OpenShift version that you have installed!**
+2. Set the `KUBECONFIG` environment variable to point to your client configuration.
+3. Set `TEST_CSI_DRIVER_FILES` to the upstream manifest.
+4. Optionally, set `TEST_OCP_CSI_DRIVER_FILES` to the OpenShift test manifest.
+5. Run the test suite: `openshift-tests run openshift/csi`.
+
+Example:
+
+```shell
+export TEST_CSI_DRIVER_FILES=upstream-manifest.yaml # this is mandatory
+export TEST_OCP_CSI_DRIVER_FILES=ocp-manifest.yaml # this is optional
+./openshift-tests run openshift/csi |& tee test.log
+```
+
+Tips:
+* `openshift-tests` runs a set of monitors *before* running any tests. They monitor the overall cluster health while the tests are running to make sure a test does not break the whole cluster. The monitors are *very* talkative and they create a lot of files in the current directory.
+* `openshift-tests run openshift/csi --dry-run` can be used to list the tests that will run.
+* `openshift-tests run openshift/csi --run=<regexp>` can be used to run only specific tests. Optionally combine it with `--dry-run` to fine-tune the regexp. Use `--help` to get more command line options.
+* `openshift-tests run-test <test name>` will run just a single test, without any monitors. There is (almost) no noise on the output and it is the best way to debug a single test. The `<test name>` must be exactly the same as printed by `--dry-run`, including all spaces. Carefully copy+paste a whole line from the `--dry-run` output, incl. double quotes. For example: `./openshift-tests run-test "External Storage [Driver: cooldriver.coolstorage.com] [Testpattern: Pre-provisioned PV (ext4)] volumes should store data"`.
+
+### With `tests` image from OpenShift release
+
+It's roughly equivalent to running the `openshift-tests` binary as described above; the binary is just in a container image.
+
+1. Prepare `kubeconfig.yaml`, the upstream test manifest and optionally the OpenShift test manifest in the current directory.
+2. Find the image with `openshift-tests` that corresponds to your OpenShift cluster version.
+   ```shell
+   $ oc adm release info --image-for=tests
+   quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:8e43b259635d5adcef769f5f4359554395c900d7211915249ee66b5602fea5b9
+   ```
+3. Run `openshift-tests` inside the `tests` container image. Make the current directory available as `/data` in the container and pass all the env. variables.
+   ```shell
+   podman run -v `pwd`:/data:z --rm -it quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:8e43b259635d5adcef769f5f4359554395c900d7211915249ee66b5602fea5b9 \
+     sh -c "KUBECONFIG=/data/kubeconfig.yaml TEST_CSI_DRIVER_FILES=/data/upstream-manifest.yaml TEST_OCP_CSI_DRIVER_FILES=/data/ocp-manifest.yaml /usr/bin/openshift-tests run openshift/csi --junit-dir /data/results"
+   ```
+
+Tips:
+* You can pass any command line parameters to `openshift-tests` as described above.
diff --git a/test/extended/storage/csi/csi.go b/test/extended/storage/csi/csi.go
index 23c056969f48..6b608d3051da 100644
--- a/test/extended/storage/csi/csi.go
+++ b/test/extended/storage/csi/csi.go
@@ -3,7 +3,6 @@ package csi
 import (
 	"fmt"
 	"os"
-	"time"

 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
@@ -36,8 +35,10 @@ type OpenShiftCSIDriverConfig struct {
 type LUNStressTestConfig struct {
 	// How many Pods with one volume each to run in total. Set to 0 to disable the test.
 	PodsTotal int
-	// How long to wait for all Pods to start. 40 minutes by default.
-	Timeout time.Duration
+	// How long to wait for all Pods to start. Accepts the same suffixes as Go's
+	// time.Duration, e.g. "40m15s" for 40 minutes and 15 seconds. 40 minutes
+	// by default.
+	Timeout string
 }

 // runtime.DecodeInto needs a runtime.Object but doesn't do any
diff --git a/test/extended/storage/csi/scsi_overflow.go b/test/extended/storage/csi/scsi_overflow.go
new file mode 100644
index 000000000000..e3575a2eb4ef
--- /dev/null
+++ b/test/extended/storage/csi/scsi_overflow.go
@@ -0,0 +1,207 @@
+package csi
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	g "github.com/onsi/ginkgo/v2"
+	corev1 "k8s.io/api/core/v1"
+	storagev1 "k8s.io/api/storage/v1"
+	resource2 "k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+	e2e "k8s.io/kubernetes/test/e2e/framework"
+	node2 "k8s.io/kubernetes/test/e2e/framework/node"
+	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
+	storageframework "k8s.io/kubernetes/test/e2e/storage/framework"
+	admissionapi "k8s.io/pod-security-admission/api"
+)
+
+func initSCSILUNOverflowCSISuite(cfg *LUNStressTestConfig) func() storageframework.TestSuite {
+	return func() storageframework.TestSuite {
+		return &scsiLUNOverflowCSISuite{
+			tsInfo: storageframework.TestSuiteInfo{
+				Name: "OpenShift CSI extended - SCSI LUN Overflow",
+				TestPatterns: []storageframework.TestPattern{
+					storageframework.FsVolModeDynamicPV,
+				},
+			},
+			lunStressTestConfig: cfg,
+		}
+	}
+}
+
+// scsiLUNOverflowCSISuite is a test suite for the LUN stress test.
+type scsiLUNOverflowCSISuite struct {
+	tsInfo              storageframework.TestSuiteInfo
+	lunStressTestConfig *LUNStressTestConfig
+}
+
+var _ storageframework.TestSuite = &scsiLUNOverflowCSISuite{}
+
+func (csiSuite *scsiLUNOverflowCSISuite) GetTestSuiteInfo() storageframework.TestSuiteInfo {
+	return csiSuite.tsInfo
+}
+
+func (csiSuite *scsiLUNOverflowCSISuite) SkipUnsupportedTests(driver storageframework.TestDriver, pattern storageframework.TestPattern) {
+	return
+}
+
+func (csiSuite *scsiLUNOverflowCSISuite) DefineTests(driver storageframework.TestDriver, pattern storageframework.TestPattern) {
+	f := e2e.NewFrameworkWithCustomTimeouts("storage-lun-overflow", storageframework.GetDriverTimeouts(driver))
+	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
+
+	// Propagate the timeout from the test config to the ginkgo.It() name ("[Timeout:xyz]"), which sets the test timeout in the suite.
+	timeoutString := DefaultLUNStressTestTimeout
+	if csiSuite.lunStressTestConfig != nil && csiSuite.lunStressTestConfig.Timeout != "" {
+		timeoutString = csiSuite.lunStressTestConfig.Timeout
+	}
+	timeout, err := time.ParseDuration(timeoutString)
+	if err != nil {
+		panic(fmt.Sprintf("Cannot parse %s as time.Duration: %s", timeoutString, err))
+	}
+	testName := fmt.Sprintf("should use many PVs on a single node [Serial][Timeout:%s]", timeoutString)
+
+	g.It(testName, func(ctx context.Context) {
+		if csiSuite.lunStressTestConfig == nil {
+			g.Skip("lunStressTestConfig is empty")
+		}
+		if csiSuite.lunStressTestConfig.PodsTotal == 0 {
+			g.Skip("lunStressTestConfig is explicitly disabled")
+		}
+		e2e.Logf("Starting LUN stress test with config: %+v", csiSuite.lunStressTestConfig)
+		until := time.Now().Add(timeout)
+
+		g.By("Selecting a schedulable node")
+		node, err := node2.GetRandomReadySchedulableNode(ctx, f.ClientSet)
+		e2e.ExpectNoError(err, "getting a schedulable node")
+
+		g.By("Creating a StorageClass")
+		config := driver.PrepareTest(ctx, f)
+		sc, err := createSC(ctx, f, driver, config)
+		e2e.ExpectNoError(err, "creating StorageClass")
+		g.DeferCleanup(func(ctx context.Context) {
+			e2e.Logf("Cleaning up StorageClass %s", sc.Name)
+			err := f.ClientSet.StorageV1().StorageClasses().Delete(ctx, sc.Name, metav1.DeleteOptions{})
+			e2e.ExpectNoError(err, "deleting StorageClass", sc.Name)
+		})
+
+		podCount := csiSuite.lunStressTestConfig.PodsTotal
+		e2e.Logf("Starting %d pods", podCount)
+		for i := 0; i < podCount; i++ {
+			startTestPod(ctx, f, node.Name, config, sc.Name, i)
+		}
+		e2e.Logf("All pods created, waiting for them to start until %s", until.String())
+
+		// Some time was already spent when creating pods.
+		waitTimeout := until.Sub(time.Now())
+		err = waitForPodsComplete(ctx, f, podCount, waitTimeout)
+		e2e.ExpectNoError(err, "waiting for pods to complete")
+		e2e.Logf("All pods completed, cleaning up")
+	})
+}
+
+// Create one PVC + Pod. Do not wait for the pod to start!
+func startTestPod(ctx context.Context, f *e2e.Framework, nodeName string, config *storageframework.PerTestConfig, scName string, podNumber int) {
+	pvcName := fmt.Sprintf("pvc-%d", podNumber)
+
+	claimSize := config.Driver.GetDriverInfo().SupportedSizeRange.Min
+	if claimSize == "" {
+		claimSize = "1Gi"
+	}
+	claimQuantity, err := resource2.ParseQuantity(claimSize)
+	e2e.ExpectNoError(err, "parsing claim size %s", claimSize)
+
+	pvc := &corev1.PersistentVolumeClaim{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      pvcName,
+			Namespace: f.Namespace.Name,
+		},
+		Spec: corev1.PersistentVolumeClaimSpec{
+			StorageClassName: &scName,
+			AccessModes: []corev1.PersistentVolumeAccessMode{
+				corev1.ReadWriteOnce,
+			},
+			Resources: corev1.VolumeResourceRequirements{
+				Requests: corev1.ResourceList{
+					corev1.ResourceStorage: claimQuantity,
+				},
+			},
+		},
+	}
+	pvc, err = f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).Create(ctx, pvc, metav1.CreateOptions{})
+	e2e.ExpectNoError(err, "creating PVC %s", pvcName)
+
+	g.DeferCleanup(func(ctx context.Context) {
+		err := f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).Delete(ctx, pvc.Name, metav1.DeleteOptions{})
+		e2e.ExpectNoError(err, "deleting PVC %s", pvc.Name)
+	})
+
+	podConfig := &e2epod.Config{
+		NS:            f.Namespace.Name,
+		PVCs:          []*corev1.PersistentVolumeClaim{pvc},
+		NodeSelection: e2epod.NodeSelection{Name: nodeName},
+		Command:       "ls -la " + e2epod.VolumeMountPath1,
+	}
+	pod, err := e2epod.MakeSecPod(podConfig)
+	e2e.ExpectNoError(err, "preparing pod %d", podNumber)
+	// Make the pod name nicer to users, it has a random uuid otherwise
+	pod.Name = fmt.Sprintf("pod-%d", podNumber)
+
+	pod, err = f.ClientSet.CoreV1().Pods(f.Namespace.Name).Create(ctx, pod, metav1.CreateOptions{})
+	e2e.ExpectNoError(err, "creating pod %d", podNumber)
+	e2e.Logf("Pod %s + PVC %s created", pod.Name, pvc.Name)
+	g.DeferCleanup(func(ctx context.Context) {
+		err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+		e2e.ExpectNoError(err, "deleting pod %s", pod.Name)
+	})
+}
+
+func createSC(ctx context.Context, f *e2e.Framework, driver storageframework.TestDriver, config *storageframework.PerTestConfig) (*storagev1.StorageClass, error) {
+	pvTester, ok := driver.(storageframework.DynamicPVTestDriver)
+	if !ok {
+		return nil, fmt.Errorf("driver %s does not support dynamic provisioning", driver.GetDriverInfo().Name)
+	}
+
+	sc := pvTester.GetDynamicProvisionStorageClass(ctx, config, "")
+	_, err := f.ClientSet.StorageV1().StorageClasses().Create(ctx, sc, metav1.CreateOptions{})
+	return sc, err
+}
+
+func waitForPodsComplete(ctx context.Context, f *e2e.Framework, podCount int, timeout time.Duration) error {
+	var incomplete, complete []*corev1.Pod
+	err := wait.PollUntilContextTimeout(ctx, 10*time.Second, timeout, false, func(ctx context.Context) (done bool, err error) {
+		pods, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).List(ctx, metav1.ListOptions{})
+		if err != nil {
+			return false, fmt.Errorf("error listing pods: %w", err)
+		}
+		complete = nil
+		incomplete = nil
+
+		for _, pod := range pods.Items {
+			if pod.Status.Phase == corev1.PodSucceeded {
+				complete = append(complete, &pod)
+			} else {
+				incomplete = append(incomplete, &pod)
+			}
+		}
+
+		if len(complete) == podCount {
+			return true, nil
+		}
+		if len(complete)+len(incomplete) != podCount {
+			return false, fmt.Errorf("unexpected pod count: expected %d, got %d", podCount, len(complete)+len(incomplete))
+		}
+		e2e.Logf("Waiting for %d pods to complete, %d done", podCount, len(complete))
+		return false, nil
+	})
+
+	if err != nil {
+		e2e.Logf("Wait failed")
+		for i := range incomplete {
+			e2e.Logf("Incomplete pod %s: %s", incomplete[i].Name, incomplete[i].Status.Phase)
+		}
+	}
+	return err
+}
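Note (not part of the patch): below is a minimal usage sketch of how the new LUN stress test could be run on its own once this patch is applied. The manifest file names are placeholders, and the `--run` regular expression is an assumption derived from the test name string built in `scsi_overflow.go` ("should use many PVs on a single node [Serial][Timeout:<Timeout>]"); check the exact name with `--dry-run` first.

```shell
# Placeholder manifest paths; the file formats are described in the README added by this patch.
export KUBECONFIG=kubeconfig.yaml
export TEST_CSI_DRIVER_FILES=upstream-manifest.yaml
export TEST_OCP_CSI_DRIVER_FILES=ocp-manifest.yaml

# List the tests that would run and find the new LUN stress test by name.
./openshift-tests run openshift/csi --dry-run | grep "use many PVs on a single node"

# Run only the LUN stress test (adjust the regexp to the exact name printed above).
./openshift-tests run openshift/csi --run="use many PVs on a single node" |& tee lun-stress.log
```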