Merge pull request #298 from dholt/1906-updates

19.06 updates
michael-balint authored Jul 11, 2019
2 parents 08967c7 + 991280a commit 7b7c57f
Showing 18 changed files with 148 additions and 302 deletions.
2 changes: 2 additions & 0 deletions ansible.cfg
@@ -11,6 +11,8 @@ deprecation_warnings = False
 timeout=60
 stdout_callback = yaml
 bin_ansible_callbacks = True
+local_tmp=/tmp
+remote_tmp=/tmp

 [ssh_connection]
 pipelining = True
3 changes: 1 addition & 2 deletions config.example/group_vars/all.yml
@@ -21,13 +21,12 @@

 # Slurm job scheduler configuration
 #slurm_user_home: /local/slurm
-#slurm_manage_gpus: no
+#slurm_manage_gpus: true
 #slurm_cluster_name: deepops
 #slurm_username: slurm
 #slurm_password: ReplaceWithASecurePasswordInTheVault
 #slurm_db_username: slurm
 #slurm_db_password: AlsoReplaceWithASecurePasswordInTheVault
-#slurm_exclusive: "NO"
 #slurm_max_job_timelimit: INFINITE
 #slurm_default_job_timelimit:

39 changes: 39 additions & 0 deletions config.example/group_vars/k8s-cluster.yml
@@ -0,0 +1,39 @@
+ansible_become: true
+kubeadm_enabled: true
+kube_api_anonymous_auth: true
+kubectl_localhost: true
+kubeconfig_localhost: true
+helm_enabled: true
+tiller_node_selectors: "node-role.kubernetes.io/master=''"
+docker_dns_servers_strict: no
+docker_storage_options: -s overlay2
+
+# Reset Flex Volume path to the default. Kubespray changes the path, which breaks Rook
+# see: https://github.com/kubernetes/community/blob/master/contributors/devel/sig-storage/flexvolume.md
+kubelet_flexvolumes_plugins_dir: /usr/libexec/kubernetes/kubelet-plugins/volume/exec
+
+# NVIDIA Docker Configuration
+# Setting reference: https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html
+docker_daemon_json:
+  bip: 192.168.99.1/24
+  default-shm-size: 1G
+  default-ulimits:
+    memlock:
+      name: memlock
+      hard: -1
+      soft: -1
+    stack:
+      name: stack
+      hard: 67108864
+      soft: 67108864
+  default-runtime: nvidia
+  runtimes:
+    nvidia:
+      path: /usr/bin/nvidia-container-runtime
+      runtimeArgs: []
+
+# When set to true, enables the PodSecurityPolicy admission controller and
+# defines two policies: privileged (applying to all resources in kube-system
+# namespace and kubelet) and restricted (applying to all other namespaces).
+# Addons deployed in kube-system namespaces are handled.
+#podsecuritypolicy_enabled: false
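
For reference, a sketch of the /etc/docker/daemon.json this variable should produce, assuming the nvidia-docker role serializes docker_daemon_json to JSON as-is (illustrative rendering, not taken from the repo):

    {
      "bip": "192.168.99.1/24",
      "default-shm-size": "1G",
      "default-ulimits": {
        "memlock": { "name": "memlock", "hard": -1, "soft": -1 },
        "stack": { "name": "stack", "hard": 67108864, "soft": 67108864 }
      },
      "default-runtime": "nvidia",
      "runtimes": {
        "nvidia": { "path": "/usr/bin/nvidia-container-runtime", "runtimeArgs": [] }
      }
    }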
2 changes: 1 addition & 1 deletion kubespray
8 changes: 5 additions & 3 deletions playbooks/docker.yml
@@ -23,14 +23,16 @@
     include_role:
       name: docker
     vars:
-      # matches: kubespray/roles/container-engine/docker/vars/ubuntu-bionic.yml
-      docker_package: 'docker-ce=18.06.1~ce~3-0~ubuntu'
+      # matches: kubespray/roles/container-engine/docker/vars/ubuntu-amd64.yml
+      #docker_package: 'docker-ce=18.06.1~ce~3-0~ubuntu'
+      docker_package: 'docker-ce=5:18.09.5~3-0~ubuntu-{{ ansible_distribution_release }}'
     when: ansible_distribution == "Ubuntu"

   - name: install docker
     include_role:
       name: docker
     vars:
       # matches: kubespray/roles/container-engine/docker/vars/redhat.yml
-      docker_package: 'docker-ce-18.06.2.ce-3.el7'
+      #docker_package: 'docker-ce-18.06.2.ce-3.el7'
+      docker_package: 'docker-ce-18.09.5-3.el7'
     when: ansible_os_family == "RedHat"
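
Before running the play, the pinned builds can be sanity-checked against the Docker package repositories by hand (illustrative commands, not part of the playbook):

    # Ubuntu: list available docker-ce builds and confirm the pin exists
    apt-cache madison docker-ce | grep '5:18.09.5~3-0~ubuntu'
    # RHEL/CentOS: same check via yum
    yum --showduplicates list docker-ce | grep 18.09.5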
58 changes: 16 additions & 42 deletions playbooks/k8s-cluster.yml
@@ -30,17 +30,8 @@
     - bootstrap

 # Install Kubernetes
+# for configuration, see: config/group_vars/k8s-cluster.yml
 - include: ../kubespray/cluster.yml
-  vars:
-    ansible_become: true
-    kubeadm_enabled: true
-    kube_api_anonymous_auth: true
-    kubectl_localhost: true
-    kubeconfig_localhost: true
-    helm_enabled: true
-    tiller_node_selectors: "node-role.kubernetes.io/master=''"
-    docker_dns_servers_strict: no
-    docker_storage_options: -s overlay2

 # Disable swap (required for k8s), kubespray method doesn't quite cut it
 - hosts: all
@@ -52,16 +43,6 @@
       command: swapoff -a
       tags:
         - swap
-
-# Create artifact of kube config
-- hosts: kube-master
-  become: true
-  tasks:
-    - name: set kube config artifact
-      set_stats:
-        data:
-          kube_config: "{{ lookup('file', inventory_dir + '/artifacts/admin.conf') }}"
-      run_once: true

 # Manage Kubernetes cluster access config file
 - hosts: all
@@ -70,32 +51,37 @@
     ansible_become: no
     config_dir: "../config"
   tasks:
-    - name: register config path
+    - name: register alternate config path
      stat:
         path: "../k8s-config"
       register: k8s_config_dir
       delegate_to: localhost
+      run_once: true
     - set_fact:
         config_dir: "../k8s-config"
       when: k8s_config_dir.stat.exists
       delegate_to: localhost
+      run_once: true
     - name: create kube config directory for current user
       file:
         path: "{{ lookup('env','HOME') + '/.kube/' }}"
         state: directory
       delegate_to: localhost
+      run_once: true
     - name: check for kube config file
       stat:
         path: "{{ config_dir }}/artifacts/admin.conf"
       register: kubeconf
       delegate_to: localhost
+      run_once: true
     - name: copy kube config file for current user
       copy:
         src: "{{ config_dir }}/artifacts/admin.conf"
         dest: "{{ lookup('env','HOME') + '/.kube/config' }}"
         backup: yes
       when: kubeconf.stat.exists
       delegate_to: localhost
+      run_once: true
   tags:
     - local

@@ -104,26 +90,6 @@
   tags:
     - nvidia
 - include: nvidia-docker.yml
-  vars:
-    # Docker Configuration
-    # Setting reference: https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html
-    docker_daemon_json:
-      bip: 192.168.99.1/24
-      default-shm-size: 1G
-      default-ulimits:
-        memlock:
-          name: memlock
-          hard: -1
-          soft: -1
-        stack:
-          name: stack
-          hard: 67108864
-          soft: 67108864
-      default-runtime: nvidia
-      runtimes:
-        nvidia:
-          path: /usr/bin/nvidia-container-runtime
-          runtimeArgs: []
   tags:
     - nvidia

@@ -137,26 +103,30 @@
     ansible_become: no
     config_dir: "../config"
   tasks:
-    - name: register config path
+    - name: register alternate config path
       stat:
         path: "../k8s-config"
       register: k8s_config_dir
       delegate_to: localhost
+      run_once: true
     - set_fact:
         config_dir: "../k8s-config"
       when: k8s_config_dir.stat.exists
       delegate_to: localhost
+      run_once: true
     - name: check for kubectl
       stat:
         path: "{{ config_dir }}/artifacts/kubectl"
       register: kubectl_local
       delegate_to: localhost
+      run_once: true
     - name: modify kubectl permissions
       file:
         path: "{{ config_dir }}/artifacts/kubectl"
         mode: '0755'
       when: kubectl_local.stat.exists
       delegate_to: localhost
+      run_once: true
     - name: copy kubectl
       copy:
         src: "{{ config_dir }}/artifacts/kubectl"
@@ -166,11 +136,13 @@
       ignore_errors: yes
       register: kubectl_copied
       delegate_to: localhost
+      run_once: true
     - name: check for copied kubectl
       stat:
         path: "/usr/local/bin/kubectl"
       register: kubectl_system
       delegate_to: localhost
+      run_once: true
     - name: modify kubectl permissions
       file:
         path: "/usr/local/bin/kubectl"
@@ -180,10 +152,12 @@
       ignore_errors: yes
       when: kubectl_system.stat.exists
       delegate_to: localhost
+      run_once: true
     - name: manually move kubectl binary
       debug:
         msg: "Unable to move kubectl, run: sudo cp {{ config_dir | realpath }}/artifacts/kubectl /usr/local/bin"
       when: kubectl_copied is failed
       delegate_to: localhost
+      run_once: true
   tags:
     - local
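
After the play completes, the copied admin.conf should make the cluster reachable from the provisioning host (a quick smoke test, assuming ~/.kube/config was written as above):

    kubectl get nodes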
1 change: 1 addition & 0 deletions roles/k8s-gpu-plugin/tasks/main.yml
@@ -5,6 +5,7 @@
     state: present
   with_items:
     - "virtualenv"
+    - "python-setuptools"
   when: ansible_distribution == 'Ubuntu'

 - name: create location for python virtual env
17 changes: 8 additions & 9 deletions roles/slurm/defaults/main.yml
@@ -1,12 +1,12 @@
 epel_baseurl: "https://download.fedoraproject.org/pub/epel/$releasever/$basearch/"
 epel_gpgkey: "https://epel.mirror.constant.com//RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"

-slurm_xenial_deb_url: "https://github.com/lukeyeager/ubuntu-slurm/releases/download/docker-1.3/slurm_18.08.5-2-xenial_amd64.deb"
-slurm_bionic_deb_url: "https://github.com/lukeyeager/ubuntu-slurm/releases/download/docker-1.3/slurm_18.08.5-2-bionic_amd64.deb"

+slurm_pkg_url: "https://github.com/DeepOps/slurm/releases/download/"
+slurm_xenial_deb: slurm_19.05.0-xenial_amd64.deb
+slurm_bionic_deb: slurm_19.05.0-bionic_amd64.deb
+slurm_rhel_rpm: slurm-19.05.0_7-1.x86_64.rpm

-slurm_pkg_tag: docker-1.4
+slurm_pkg_tag: v1.5

 slurm_cluster_name: deepops
 slurm_username: slurm
@@ -28,13 +28,12 @@ slurm_return_to_service: 1

 # Sets: GresTypes=gpu
 # Sets: Gres=gpu:{{ gpu_topology|count }}
-# Default: no
-slurm_manage_gpus: no
+slurm_manage_gpus: true

-# Controls the ability of the partition to execute more than one job at a time on each resource (node, socket or core depending upon the value of SelectTypeParameters). If resources are to be over-subscribed, avoiding memory over-subscription is very important. SelectTypeParameters should be configured to treat memory as a consumable resource and the --mem option should be used for job allocations. Sharing of resources is typically useful only when using gang scheduling (PreemptMode=suspend,gang). Possible values for OverSubscribe are "EXCLUSIVE", "FORCE", "YES", and "NO". Note that a value of "YES" or "FORCE" can negatively impact performance for systems with many thousands of running jobs. The default value is "NO".
-# Sets: OverSubscribe=EXCLUSIVE
-# Default: NO
-slurm_exclusive: "EXCLUSIVE"
+# Sets partition OverSubscribe
+# To avoid sharing nodes, set to "EXCLUSIVE"
+slurm_oversubscribe: "NO"

 # Maximum run time limit for jobs. Format is minutes, minutes:seconds, hours:minutes:seconds, days-hours, days-hours:minutes, days-hours:minutes:seconds or "UNLIMITED". Time resolution is one minute and second values are rounded up to the next minute. This limit does not apply to jobs executed by SlurmUser or user root.
 # Sets: MaxTime={{ slurm_max_job_timelimit }}
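
For example, a site wanting a two-day ceiling and a four-hour default could set (hypothetical values, using the formats described above; quoted so YAML does not parse them as numbers):

    slurm_max_job_timelimit: "2-00:00:00"
    slurm_default_job_timelimit: "4:00:00"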
3 changes: 3 additions & 0 deletions roles/slurm/files/prolog-parts.d/50-all-docker
@@ -1,6 +1,9 @@
 #!/usr/bin/env bash
 set -ex

+# set path to find `usermod` on RHEL/CentOS
+PATH=$PATH:/sbin:/usr/sbin
+
 getent group docker || exit 0 # docker not installed

 usermod -aG docker "$SLURM_JOB_USER"
1 change: 1 addition & 0 deletions roles/slurm/tasks/compute.yml
@@ -7,6 +7,7 @@
   with_items:
     - hwloc
     - hwloc-devel
+    - psmisc
   when: ansible_os_family == "RedHat"

 - name: add cgroups to grub options
1 change: 0 additions & 1 deletion roles/slurm/tasks/controller.yml
@@ -100,7 +100,6 @@
   failed_when: "create_cluster_result.rc != 0 and 'already exists' not in create_cluster_result.stdout"
   changed_when: "'already exists' not in create_cluster_result.stdout"

-
 - name: create account
   command: sacctmgr -i add account compute-account Description="Compute Accounts" Organization="Prestige"
   register: create_account_result
14 changes: 11 additions & 3 deletions roles/slurm/tasks/main.yml
@@ -13,7 +13,9 @@
 - name: install slurm (xenial)
   when: ansible_facts['distribution'] == 'Ubuntu' and ansible_facts['distribution_major_version'] == '16'
   apt:
-    deb: "{{ slurm_xenial_deb_url }}"
+    deb: "{{ slurm_pkg_url }}/{{ slurm_pkg_tag }}/{{ item }}"
+  with_items:
+    - "{{ slurm_xenial_deb }}"
   notify:
     - restart slurmd
     - restart slurmdbd
@@ -23,7 +25,9 @@
 - name: install slurm (bionic)
   when: (ansible_facts['distribution'] == 'Ubuntu' and ansible_facts['distribution_major_version'] == '18')
   apt:
-    deb: "{{ slurm_bionic_deb_url }}"
+    deb: "{{ slurm_pkg_url }}/{{ slurm_pkg_tag }}/{{ item }}"
+  with_items:
+    - "{{ slurm_bionic_deb }}"
   notify:
     - restart slurmd
     - restart slurmdbd
@@ -35,7 +39,11 @@
     name: "{{ slurm_pkg_url }}/{{ slurm_pkg_tag }}/{{ item }}"
     state: present
   with_items:
-    - slurm-18.08.5_2_7-1.x86_64.rpm
+    - "{{ slurm_rhel_rpm }}"
+  notify:
+    - restart slurmd
+    - restart slurmdbd
+    - restart slurmctld
   when: ansible_os_family == "RedHat"
   register: install_slurm_rhel

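With the defaults above, the bionic task should resolve its package source to roughly:

    https://github.com/DeepOps/slurm/releases/download//v1.5/slurm_19.05.0-bionic_amd64.deb

(the doubled slash comes from the trailing slash in slurm_pkg_url combined with the literal '/' in the task; GitHub appears to tolerate it)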
3 changes: 2 additions & 1 deletion roles/slurm/templates/slurm.conf
@@ -121,4 +121,5 @@ GresTypes=gpu
 # hardcoding the partitions and default memory per node
 # TODO: automatically define the partitions by resource
 # TODO: set DefMemPerCPU = TotalMemory / LogicalCPUs
-PartitionName=batch Nodes=ALL Default=YES DefMemPerCPU=0 State=UP {% if slurm_exclusive is defined %} OverSubscribe={{ slurm_exclusive }} {% endif %} {%- if slurm_max_job_timelimit is defined %} MaxTime={{ slurm_max_job_timelimit }} {%- else %} MaxTime=INFINITE {% endif -%} {%- if slurm_default_job_timelimit is defined %} DefaultTime={{ slurm_default_job_timelimit }} {% endif -%}
+{% set slurm_oversubscribe = (slurm_manage_gpus == true)|ternary('NO', 'EXCLUSIVE') %}
+PartitionName=batch Nodes=ALL Default=YES DefMemPerCPU=0 State=UP OverSubscribe={{ slurm_oversubscribe }} {%- if slurm_max_job_timelimit is defined %} MaxTime={{ slurm_max_job_timelimit }} {%- else %} MaxTime=INFINITE {% endif -%} {%- if slurm_default_job_timelimit is defined %} DefaultTime={{ slurm_default_job_timelimit }} {% endif -%}
20 changes: 6 additions & 14 deletions scripts/k8s_deploy_rook.sh
@@ -3,18 +3,15 @@
 # Upgrading:
 # `helm update`
 # `helm search rook` # get latest version number
-# `helm upgrade --namespace rook-ceph-system rook-ceph rook-master/rook-ceph --version v0.9.0-174.g3b14e51`
+# `helm upgrade --namespace rook-ceph rook-ceph rook-release/rook-ceph --version v0.9.0-174.g3b14e51`

 set -x

-HELM_ROOK_CHART_REPO="${HELM_ROOK_CHART_REPO:-https://charts.rook.io/master}"
+HELM_ROOK_CHART_REPO="${HELM_ROOK_CHART_REPO:-https://charts.rook.io/release}"
+HELM_ROOK_CHART_VERSION="${HELM_ROOK_CHART_VERSION:-v1.0.2}"

 ./scripts/install_helm.sh

-helm repo list | grep rook-master >/dev/null 2>&1
-if [ $? -ne 0 ] ; then
-    helm repo add rook-master "${HELM_ROOK_CHART_REPO}"
-fi
+# https://github.com/rook/rook/blob/master/Documentation/helm-operator.md
+helm repo add rook-release "${HELM_ROOK_CHART_REPO}"

 # Use an alternate image if set
 helm_install_extra_flags=""
@@ -25,14 +22,9 @@ fi
 # Install rook-ceph
 helm status rook-ceph >/dev/null 2>&1
 if [ $? -ne 0 ] ; then
-    helm install \
-        --namespace rook-ceph-system \
-        --name rook-ceph \
-        rook-master/rook-ceph \
-        --version v0.9.0-79.g1a1ffdd ${helm_install_extra_flags}
+    helm install --namespace rook-ceph --name rook-ceph rook-release/rook-ceph --version "${HELM_ROOK_CHART_VERSION}" ${helm_install_extra_flags}
 fi

-
 if kubectl -n rook-ceph get pod -l app=rook-ceph-tools 2>&1 | grep "No resources found." >/dev/null 2>&1; then
     sleep 5
     # If we have an alternate registry defined, dynamically substitute it in
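
Because both settings use shell parameter defaults, the chart repo and version can be overridden per run without editing the script (hypothetical invocation):

    HELM_ROOK_CHART_VERSION=v1.0.1 HELM_ROOK_CHART_REPO=https://charts.rook.io/release ./scripts/k8s_deploy_rook.sh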