Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic experimental cache support #58

Merged
merged 9 commits into from
Sep 6, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cmd/sidecar_mounter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,14 @@ import (
"os"
"os/signal"
"path/filepath"
"strings"
"sync"
"syscall"
"time"

sidecarmounter "github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/sidecar_mounter"
sidecarspec "github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/webhook"

"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/util"
"k8s.io/klog/v2"
)
Expand Down Expand Up @@ -189,5 +192,12 @@ func prepareMountConfig(sp string) (*sidecarmounter.MountConfig, error) {
return nil, fmt.Errorf("failed to fetch bucket name from CSI driver")
}

for _, opt := range mc.Options {
if strings.Contains(opt, "experimental-local-file-cache") {
judemars marked this conversation as resolved.
Show resolved Hide resolved

mc.TempDir = filepath.Join(sidecarspec.CacheVolumeMountPath, "gcsfuse-tmp")
judemars marked this conversation as resolved.
Show resolved Hide resolved
}
}

return &mc, nil
}
85 changes: 85 additions & 0 deletions examples/jupyter/jupyter-experimental-readcache.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Tensorflow/Jupyter StatefulSet
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: tensorflow
namespace: example
spec:
selector:
matchLabels:
pod: tensorflow-pod
serviceName: tensorflow
replicas: 1
template:
metadata:
annotations:
gke-gcsfuse/volumes: "true"
gke-gcsfuse/cpu-limit: 500m
gke-gcsfuse/memory-limit: 10Gi
gke-gcsfuse/ephemeral-storage-limit: 30Gi
labels:
pod: tensorflow-pod
spec:
serviceAccountName: gcsfuse-ksa
nodeSelector:
cloud.google.com/gke-accelerator: nvidia-tesla-t4
terminationGracePeriodSeconds: 30
containers:
- name: tensorflow-container
securityContext:
privileged: true
image: tensorflow/tensorflow:2.13.0-gpu-jupyter
volumeMounts:
- name: tensorflow-pvc
mountPath: /tf/saved
resources:
limits:
nvidia.com/gpu: "1"
ephemeral-storage: 30Gi
memory: 10Gi
requests:
nvidia.com/gpu: "1"
ephemeral-storage: 30Gi
memory: 10Gi
env:
- name: JUPYTER_TOKEN
value: "jupyter"
volumes:
- name: tensorflow-pvc
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: <bucket-name> # unique bucket name
# update your experimental cache file options according to flags
# from
# https://github.com/GoogleCloudPlatform/gcsfuse/blob/19ed094b6612789b09ad4a1df3a2314099c65129/flags.go#L233C1-L236
mountOptions: "experimental-local-file-cache,stat-cache-ttl=240m0s,type-cache-ttl=240m0s,stat-cache-capacity=5000000000"

---
# Headless service for the above StatefulSet
apiVersion: v1
kind: Service
metadata:
name: tensorflow
namespace: example
spec:
ports:
- port: 8888
clusterIP: None
selector:
pod: tensorflow-pod
---
# External service
apiVersion: "v1"
kind: "Service"
metadata:
name: tensorflow-jupyter
namespace: example
spec:
ports:
- protocol: "TCP"
port: 80
targetPort: 8888
selector:
pod: tensorflow-pod
type: LoadBalancer
6 changes: 2 additions & 4 deletions pkg/cloud_provider/clientset/fake.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,9 @@ func (c *FakeClientset) GetPod(_ context.Context, namespace, name string) (*v1.P
},
Spec: v1.PodSpec{
Containers: []v1.Container{
webhook.GetSidecarContainerSpec(config),
},
Volumes: []v1.Volume{
webhook.GetSidecarContainerVolumeSpec(),
webhook.GetSidecarContainerSpec(config, false),
},
Volumes: webhook.GetSidecarContainerVolumeSpec(false),
},
}

Expand Down
21 changes: 19 additions & 2 deletions pkg/webhook/mutatingwebhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,25 @@ func (si *SidecarInjector) Handle(_ context.Context, req admission.Request) admi

klog.Infof("mutating Pod: Name %q, GenerateName %q, Namespace %q, CPU limit %q, memory limit %q, ephemeral storage limit %q", pod.Name, pod.GenerateName, pod.Namespace, configCopy.CPULimit.String(), configCopy.MemoryLimit.String(), configCopy.EphemeralStorageLimit.String())
// the gcsfuse sidecar container has to before the containers that consume the gcsfuse volume
pod.Spec.Containers = append([]corev1.Container{GetSidecarContainerSpec(configCopy)}, pod.Spec.Containers...)
pod.Spec.Volumes = append([]corev1.Volume{GetSidecarContainerVolumeSpec()}, pod.Spec.Volumes...)
useExperimentalLocalFileCache := false
hasCacheVolume := false
for _, v := range pod.Spec.Volumes {
if v.Name == CacheVolumeName {
hasCacheVolume = false
judemars marked this conversation as resolved.
Show resolved Hide resolved
}
if v.CSI == nil || v.CSI.VolumeAttributes == nil {
judemars marked this conversation as resolved.
Show resolved Hide resolved
continue
}
if val, ok := v.CSI.VolumeAttributes["mountOptions"]; ok {
if strings.Contains(val, "experimental-local-file-cache") {
useExperimentalLocalFileCache = true
}
}
}
// Add a volume for the experimental read cache if none already exists
addCacheVolume := useExperimentalLocalFileCache && !hasCacheVolume
pod.Spec.Containers = append([]corev1.Container{GetSidecarContainerSpec(configCopy, addCacheVolume)}, pod.Spec.Containers...)
pod.Spec.Volumes = append(GetSidecarContainerVolumeSpec(addCacheVolume), pod.Spec.Volumes...)
marshaledPod, err := json.Marshal(pod)
if err != nil {
return admission.Errored(http.StatusBadRequest, fmt.Errorf("failed to marshal pod: %w", err))
Expand Down
37 changes: 30 additions & 7 deletions pkg/webhook/sidecar_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,18 @@ const (
SidecarContainerName = "gke-gcsfuse-sidecar"
SidecarContainerVolumeName = "gke-gcsfuse-tmp"
SidecarContainerVolumeMountPath = "/gcsfuse-tmp"
CacheVolumeName = "cache-volume"
judemars marked this conversation as resolved.
Show resolved Hide resolved
CacheVolumeMountPath = "/cache"

// See the nonroot user discussion: https://github.com/GoogleContainerTools/distroless/issues/443
NobodyUID = 65534
NobodyGID = 65534
)

func GetSidecarContainerSpec(c *Config) v1.Container {
func GetSidecarContainerSpec(c *Config, addCacheVolume bool) v1.Container {
// The sidecar container follows Restricted Pod Security Standard,
// see https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
return v1.Container{
toReturn := v1.Container{
Name: SidecarContainerName,
Image: c.ContainerImage,
ImagePullPolicy: v1.PullPolicy(c.ImagePullPolicy),
Expand Down Expand Up @@ -74,15 +76,36 @@ func GetSidecarContainerSpec(c *Config) v1.Container {
},
},
}

if addCacheVolume {
toReturn.VolumeMounts = append(toReturn.VolumeMounts, v1.VolumeMount{
Name: CacheVolumeName,
MountPath: CacheVolumeMountPath,
})
}
return toReturn
}

func GetSidecarContainerVolumeSpec() v1.Volume {
return v1.Volume{
Name: SidecarContainerVolumeName,
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{},
func GetSidecarContainerVolumeSpec(addCacheVolume bool) []v1.Volume {
toReturn := []v1.Volume{
{
Name: SidecarContainerVolumeName,
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{},
},
},
}
if addCacheVolume {
toReturn = append(toReturn, v1.Volume{
Name: CacheVolumeName,
VolumeSource: v1.VolumeSource{
HostPath: &v1.HostPathVolumeSource{
Path: "/mnt/stateful_partition/kube-ephemeral-ssd",
judemars marked this conversation as resolved.
Show resolved Hide resolved
},
},
})
}
return toReturn
}

// ValidatePodHasSidecarContainerInjected validates the following:
Expand Down