-
Notifications
You must be signed in to change notification settings - Fork 16
/
compute_build_base_img.yml
280 lines (231 loc) · 7.38 KB
/
compute_build_base_img.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
---
# Play 1: boot a throwaway OpenStack instance from the featured Rocky Linux 8
# image. It is configured by play 2 and snapshotted by play 3 to produce the
# compute-node base image. Instance/network/key names are derived from the
# headnode hostname so multiple clusters can coexist in one project.
- hosts: localhost
  vars:
    compute_base_image: "Featured-RockyLinux8"
    sec_group_global: "{{ ansible_facts.hostname }}-ssh-global"
    sec_group_internal: "{{ ansible_facts.hostname }}-internal"
    compute_base_size: "m3.tiny"
    network_name: "{{ ansible_facts.hostname }}-elastic-net"
    JS_ssh_keyname: "{{ ansible_facts.hostname }}-slurm-key"
    openstack_cloud: "openstack"
  vars_files:
    - clouds.yaml
  tasks:
    - name: build compute base instance
      os_server:
        timeout: 300
        state: present
        name: "compute-{{ ansible_facts.hostname }}-base-instance"
        cloud: "{{ openstack_cloud }}"
        image: "{{ compute_base_image }}"
        key_name: "{{ JS_ssh_keyname }}"
        security_groups: "{{ sec_group_global }},{{ sec_group_internal }}"
        flavor: "{{ compute_base_size }}"
        meta: { compute: "base" }
        # Canonical booleans ("no"/"yes" truthy strings are ambiguous across
        # YAML 1.1/1.2 parsers); no floating IP — node is internal-only.
        auto_ip: false
        # Minimal cloud-init: skip package work at boot so the instance comes
        # up fast; all provisioning is done by play 2 over SSH.
        user_data: |
          #cloud-config
          packages: []
          package_update: false
          package_upgrade: false
          package_reboot_if_required: false
          final_message: "Boot completed in $UPTIME seconds"
        network: "{{ network_name }}"
        wait: true
      register: os_host
    - debug:
        var: os_host
    # Add the new server to the in-memory inventory under "compute-base" so
    # play 2 can target it by its private address.
    - name: add compute instance to inventory
      add_host:
        name: "{{ os_host['openstack']['name'] }}"
        groups: "compute-base"
        ansible_host: "{{ os_host.openstack.private_v4 }}"
    # NOTE(review): fixed sleep; a wait_for on port 22 would be more robust,
    # but 90 s matches the original behavior.
    - name: pause for ssh to come up
      pause:
        seconds: 90
# Play 2: turn the freshly-booted instance into an OpenHPC 2.x / Slurm
# compute node: install packages, align the slurm uid with the headnode,
# mount headnode NFS exports, copy munge key + slurm config, enable slurmd.
- hosts: compute-base
  vars:
    compute_base_package_list:
      - "python3-libselinux"
      - "telnet"
      - "bind-utils"
      - "vim"
      - "openmpi4-gnu9-ohpc"
      - "ohpc-slurm-client"
      - "lmod-ohpc"
      - "ceph-common"
    packages_to_remove:
      - "environment-modules"
      - "containerd.io.x86_64"
      - "docker-ce.x86_64"
      - "docker-ce-cli.x86_64"
      - "docker-ce-rootless-extras.x86_64"
      - "Lmod"
  tasks:
    # Runs on the headnode (localhost): ask OpenStack for this cluster's
    # headnode private IP, used below for the NFS fstab entries.
    - name: Get the headnode private IP
      local_action:
        module: shell source /etc/slurm/openrc.sh && openstack server show $(hostname -s) | grep addresses | awk -F'|' '{print $3}' | awk -F'=' '{print $2}' | awk -F',' '{print $1}'
      register: headnode_private_ip
      become: false  # for running as slurm, since no sudo on localhost
    # The slurm uid must match the headnode's so NFS-shared files keep
    # consistent ownership across nodes.
    - name: Get the slurmctld uid
      local_action:
        module: shell getent passwd slurm | awk -F':' '{print $3}'
      register: headnode_slurm_uid
      become: false  # for running as slurm, since no sudo on localhost
    - name: turn off the firewall
      service:
        name: firewalld
        state: stopped
        enabled: false
    - name: Add OpenHPC 2.0 repo
      dnf:
        name: "http://repos.openhpc.community/OpenHPC/2/CentOS_8/x86_64/ohpc-release-2-1.el8.x86_64.rpm"
        state: present
        lock_timeout: 900
        disable_gpg_check: true
    # config-manager has no dedicated module option here; plain commands are
    # not idempotent but are harmless to re-run.
    - name: Enable CentOS PowerTools repo
      command: dnf config-manager --set-enabled powertools
    - name: Disable docker-ce repo
      command: dnf config-manager --set-disabled docker-ce-stable
    - name: remove env-modules and docker packages
      dnf:
        name: "{{ packages_to_remove }}"
        state: absent
        lock_timeout: 300
    # There is an issue in removing Lmod in early call. Seems like we need to run it twice
    - name: remove Lmod packages
      dnf:
        name: Lmod
        state: absent
        lock_timeout: 300
    - name: install basic packages
      dnf:
        name: "{{ compute_base_package_list }}"
        state: present
        lock_timeout: 300
    - name: fix slurm user uid
      user:
        name: slurm
        uid: "{{ headnode_slurm_uid.stdout }}"
        shell: "/sbin/nologin"
        home: "/etc/slurm"
    - name: create slurm spool directories
      file:
        path: /var/spool/slurm/ctld
        state: directory
        owner: slurm
        group: slurm
        # Quoted octal mode: bare 0755 is parsed as an integer by YAML and
        # can be misinterpreted (Ansible file-module documented pitfall).
        mode: "0755"
        recurse: true
    - name: change ownership of slurm files
      file:
        path: "{{ item }}"
        owner: slurm
        group: slurm
      with_items:
        - "/var/spool/slurm"
        - "/var/spool/slurm/ctld"
        # - "/var/log/slurm_jobacct.log"
    - name: disable selinux
      selinux:
        state: permissive
        policy: targeted
    # - name: allow use_nfs_home_dirs
    #   seboolean: name=use_nfs_home_dirs state=yes persistent=yes
    - name: import /home on compute nodes
      lineinfile:
        dest: /etc/fstab
        line: "{{ headnode_private_ip.stdout }}:/home /home nfs defaults,nfsvers=4.0 0 0"
        state: present
    - name: ensure /opt/ohpc/pub exists
      file:
        path: /opt/ohpc/pub
        state: directory
        mode: "0777"
        recurse: true
    - name: import /opt/ohpc/pub on compute nodes
      lineinfile:
        dest: /etc/fstab
        line: "{{ headnode_private_ip.stdout }}:/opt/ohpc/pub /opt/ohpc/pub nfs defaults,nfsvers=4.0 0 0"
        state: present
    - name: ensure /export exists
      file:
        path: /export
        state: directory
        mode: "0777"
    - name: import /export on compute nodes
      lineinfile:
        dest: /etc/fstab
        line: "{{ headnode_private_ip.stdout }}:/export /export nfs defaults,nfsvers=4.0 0 0"
        state: present
    - name: fix sda1 mount in fstab
      lineinfile:
        dest: /etc/fstab
        regex: "/ xfs defaults"
        line: "/dev/sda1 / xfs defaults 0 0"
        state: present
    # /tmp/add_users.sh is generated elsewhere; best-effort by design.
    - name: add local users to compute node
      script: /tmp/add_users.sh
      ignore_errors: true
    # synchronize push: copies FROM the controller (headnode) TO the node,
    # so munge/slurm credentials and config stay in sync cluster-wide.
    - name: copy munge key from headnode
      synchronize:
        mode: push
        src: /etc/munge/munge.key
        dest: /etc/munge/munge.key
        set_remote_user: false
        use_ssh_args: true
    - name: fix perms on munge key
      file:
        path: /etc/munge/munge.key
        owner: munge
        group: munge
        mode: "0600"
    - name: copy slurm.conf from headnode
      synchronize:
        mode: push
        src: /etc/slurm/slurm.conf
        dest: /etc/slurm/slurm.conf
        set_remote_user: false
        use_ssh_args: true
    - name: copy slurm_prolog.sh from headnode
      synchronize:
        mode: push
        src: /usr/local/sbin/slurm_prolog.sh
        dest: /usr/local/sbin/slurm_prolog.sh
        set_remote_user: false
        use_ssh_args: true
    - name: enable munge
      service:
        name: munge.service
        enabled: true
    - name: enable slurmd
      service:
        name: slurmd
        enabled: true
    # Reference: the stock slurmd unit this sed edits —
    # cat /etc/systemd/system/multi-user.target.wants/slurmd.service
    # [Unit]
    # Description=Slurm node daemon
    # After=network.target munge.service #CHANGING TO: network-online.target
    # ConditionPathExists=/etc/slurm/slurm.conf
    #
    # [Service]
    # Type=forking
    # EnvironmentFile=-/etc/sysconfig/slurmd
    # ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS
    # ExecReload=/bin/kill -HUP $MAINPID
    # PIDFile=/var/run/slurmd.pid
    # KillMode=process
    # LimitNOFILE=51200
    # LimitMEMLOCK=infinity
    # LimitSTACK=infinity
    # Delegate=yes
    #
    #
    # [Install]
    # WantedBy=multi-user.target
    # Make slurmd wait for sshd and the NFS mounts above, otherwise jobs can
    # start before /home and /opt/ohpc/pub are available.
    - name: change slurmd service "After" to sshd and remote filesystems
      command: sed -i 's/network.target/sshd.service remote-fs.target/' /usr/lib/systemd/system/slurmd.service
    - name: add slurmd service "Requires" of sshd and remote filesystems
      command: sed -i '/After=network/aRequires=sshd.service remote-fs.target' /usr/lib/systemd/system/slurmd.service
    # - name: mount -a on compute nodes
    #   command: "mount -a"
# Play 3: snapshot the now-configured instance so it becomes the reusable
# compute-node base image. Snapshotting is delegated to a helper script
# (run from the playbook's working directory).
- hosts: localhost
  vars_files:
    - clouds.yaml
  tasks:
    - name: create compute instance snapshot
      command: ./compute_take_snapshot.sh
    # Instance teardown left disabled below:
    # os_server no longer handles instance state correctly
    # - name: remove compute instance
    #   os_server:
    #     timeout: 200
    #     state: absent
    #     name: "compute-{{ inventory_hostname_short }}-base-instance"
    #     cloud: "tacc"