Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reliability/containerd cri runtime #2454

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions 10-eksclt.al2.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# eksctl-specific systemd drop-in unit for kubelet, for Amazon Linux 2 (AL2)

[Service]
# Local metadata parameters: REGION, AWS_DEFAULT_REGION
EnvironmentFile=/etc/eksctl/metadata.env
# Global and static parameters: CLUSTER_DNS, NODE_LABELS, NODE_TAINTS
EnvironmentFile=/etc/eksctl/kubelet.env
# Local non-static parameters: NODE_IP, INSTANCE_ID
EnvironmentFile=/etc/eksctl/kubelet.local.env

ExecStart=
ExecStart=/usr/bin/kubelet \
--node-ip=${NODE_IP} \
--node-labels=${NODE_LABELS},alpha.eksctl.io/instance-id=${INSTANCE_ID} \
--max-pods=${MAX_PODS} \
--register-node=true --register-with-taints=${NODE_TAINTS} \
--cloud-provider=aws \
--container-runtime=docker \
--network-plugin=cni \
--cni-bin-dir=/opt/cni/bin \
--cni-conf-dir=/etc/cni/net.d \
--pod-infra-container-image=${AWS_EKS_ECR_ACCOUNT}.dkr.ecr.${AWS_DEFAULT_REGION}.${AWS_SERVICES_DOMAIN}/eks/pause:3.3-eksbuild.1 \
--kubeconfig=/etc/eksctl/kubeconfig.yaml \
--config=/etc/eksctl/kubelet.yaml
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ kubectl:
@eval $$(python3 ./manager/cluster_config_env.py ./dev/config/cluster.yaml) && eksctl utils write-kubeconfig --cluster="$$CORTEX_CLUSTER_NAME" --region="$$CORTEX_REGION" | (grep -v "saved kubeconfig as" | grep -v "using region" | grep -v "eksctl version" || true)

cluster-up:
@$(MAKE) images-all
# @$(MAKE) images-all
@$(MAKE) cli
@kill $(shell pgrep -f rerun) >/dev/null 2>&1 || true
@eval $$(python3 ./manager/cluster_config_env.py ./dev/config/cluster.yaml) && ./bin/cortex cluster up ./dev/config/cluster.yaml --configure-env="$$CORTEX_CLUSTER_NAME"
Expand Down
133 changes: 133 additions & 0 deletions containerd.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
version = 2
root = "/var/lib/containerd"
state = "/run/containerd"
plugin_dir = ""
disabled_plugins = []
required_plugins = []
oom_score = 0

[grpc]
address = "/run/containerd/containerd.sock"
tcp_address = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216

[ttrpc]
address = ""
uid = 0
gid = 0

[debug]
address = ""
uid = 0
gid = 0
level = ""

[metrics]
address = ""
grpc_histogram = false

[cgroup]
path = ""

[timeouts]
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"

[plugins]
[plugins."io.containerd.gc.v1.scheduler"]
pause_threshold = 0.02
deletion_threshold = 0
mutation_threshold = 100
schedule_delay = "0s"
startup_delay = "100ms"
[plugins."io.containerd.grpc.v1.cri"]
disable_tcp_service = true
stream_server_address = "127.0.0.1"
stream_server_port = "0"
stream_idle_timeout = "4h0m0s"
enable_selinux = false
selinux_category_range = 1024
sandbox_image = "k8s.gcr.io/pause:3.2"
stats_collect_period = 10
systemd_cgroup = false
enable_tls_streaming = false
max_container_log_line_size = 16384
disable_cgroup = false
disable_apparmor = false
restrict_oom_score_adj = false
max_concurrent_downloads = 3
disable_proc_mount = false
unset_seccomp_profile = ""
tolerate_missing_hugetlb_controller = true
disable_hugetlb_controller = true
ignore_image_defined_volumes = false
[plugins."io.containerd.grpc.v1.cri".containerd]
snapshotter = "overlayfs"
default_runtime_name = "runc"
no_pivot = false
disable_snapshot_annotations = true
discard_unpacked_layers = false
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
runtime_type = ""
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"
runtime_engine = ""
runtime_root = ""
privileged_without_host_devices = false
base_runtime_spec = ""
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
conf_dir = "/etc/cni/net.d"
max_conf_num = 1
conf_template = ""
[plugins."io.containerd.grpc.v1.cri".registry]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."docker.io"]
endpoint = ["https://registry-1.docker.io"]
[plugins."io.containerd.grpc.v1.cri".image_decryption]
key_model = ""
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
tls_cert_file = ""
tls_key_file = ""
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
[plugins."io.containerd.runtime.v1.linux"]
shim = "containerd-shim"
runtime = "runc"
runtime_root = ""
no_shim = false
shim_debug = false
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
[plugins."io.containerd.snapshotter.v1.devmapper"]
root_path = ""
pool_name = ""
base_image_size = ""
async_remove = false
2 changes: 1 addition & 1 deletion images/manager/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ RUN pip install --upgrade pip && \

RUN apk add --no-cache bash curl gettext jq openssl

RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.40.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \
RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.44.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \
mv /tmp/eksctl /usr/local/bin

RUN curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.18.9/2020-11-02/bin/linux/amd64/aws-iam-authenticator && \
Expand Down
6 changes: 5 additions & 1 deletion manager/generate_eks.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,13 @@ def default_nodegroup(cluster_config):
"evictionHard": {"memory.available": "200Mi", "nodefs.available": "5%"},
"registryPullQPS": 10,
},
"preBootstrapCommands": [
"yum install containerd -y",
"truncate -s-1 /etc/systemd/system/kubelet.service.d/10-eksclt.al2.conf",
"echo -n ' --container-runtime=remote --container-runtime-endpoint=unix:///run/containerd/containerd.sock' >> /etc/systemd/system/kubelet.service.d/10-eksclt.al2.conf",
],
}


def merge_override(a, b):
"merges b into a"
for key in b:
Expand Down