forked from NVIDIA/NeMo
-
Notifications
You must be signed in to change notification settings - Fork 1
52 lines (49 loc) · 1.49 KB
/
node-reboot.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# # Periodically reboots the online CI runner VMs, one matrix job per VM
# name: Reboots VMs in a controlled way
# on:
# schedule:
# - cron: 0 0,4,8,12,16,20 * * *
# workflow_dispatch:
# jobs:
# pre-flight:
# runs-on: ubuntu-latest
# outputs:
# list-of-vms: ${{ steps.main.outputs.main }}
# environment: main
# steps:
# - name: Get list of VMs
# id: main
# env:
# GITHUB_TOKEN: ${{ secrets.PAT }}
# run: |
# RUNNERS=$(curl -L \
# -H "Accept: application/vnd.github+json" \
# -H "Authorization: Bearer $GITHUB_TOKEN" \
# -H "X-GitHub-Api-Version: 2022-11-28" \
# https://api.github.com/repos/NVIDIA/NeMo/actions/runners)
# MATRIX=$(echo $RUNNERS \
# | jq -c '[
# .runners[]
# | select(.status == "online")
# | {
# "vm": .name,
# "n_gpus": [
# .labels[]
# | select(.name | endswith("gpu")) | .name
# ][0][:1]
# }
# ]
# '
# )
# echo main=$MATRIX | tee -a "$GITHUB_OUTPUT"
# maintenance:
# needs: pre-flight
# strategy:
# fail-fast: false
# matrix:
# include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}}
# uses: ./.github/workflows/node-reboot-single-vm.yml
# with:
# vm: ${{ matrix.vm }}
# n_gpus: ${{ matrix.n_gpus }}
# secrets: inherit