Skip to content

Commit

Permalink
initial docker image
Browse files Browse the repository at this point in the history
  • Loading branch information
alexboden committed Oct 16, 2024
1 parent 92f3442 commit 4d79d49
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 45 deletions.
14 changes: 14 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Slim Python 3.9 base keeps the runner-allocation service image small.
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Install dependencies first so this layer is cached and only rebuilt when
# requirements.txt changes — not on every source-file edit.
# --no-cache-dir avoids persisting pip's download cache in the image layer.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source and the runner-allocation script
COPY main.py config.py runner_size_config.py RunningJob.py allocate-ephemeral-runner-from-apptainer.sh /app/

# Run the polling service
CMD ["python", "main.py"]
28 changes: 20 additions & 8 deletions allocate-ephemeral-runner-from-apptainer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,13 @@ REMOVAL_TOKEN=$3
LABELS=$4
RUN_ID=$5

export DOCKER_HOST=unix:///tmp/run/docker.sock

# Define the parent directory for GitHub Actions in the host machine
PARENT_DIR="/tmp/runner-${SLURMD_NODENAME}-${SLURM_JOB_ID}"
PROVISIONER_DIR="/mnt/wato-drive2/alexboden/provisioner-cache/$RUN_ID"
# PROVISIONER_DIR="/mnt/wato-drive/alexboden/provisioner-cache/$RUN_ID"
log "INFO Parent directory for GitHub Actions: $PARENT_DIR"

start_time=$(date +%s)
mkdir -p $PROVISIONER_DIR
chmod -R 777 $PROVISIONER_DIR
# mkdir -p $PROVISIONER_DIR
# chmod -R 777 $PROVISIONER_DIR
GITHUB_ACTIONS_WKDIR="$PARENT_DIR/_work"
mkdir -p $PARENT_DIR $GITHUB_ACTIONS_WKDIR
chmod -R 777 $PARENT_DIR
Expand All @@ -39,6 +36,7 @@ log "INFO Created and set permissions for parent directory (Duration: $(($end_ti
log "INFO Starting Docker on Slurm"
start_time=$(date +%s)
slurm-start-dockerd.sh
export DOCKER_HOST=unix:///tmp/run/docker.sock
if [ $? -ne 0 ]; then
log "ERROR Docker failed to start (non-zero exit code)"
exit 1
Expand All @@ -52,11 +50,25 @@ source /cvmfs/soft.computecanada.ca/config/profile/bash.sh
module load apptainer

# Define the Docker image to use
DOCKER_IMAGE="/cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main"
export ACTIONS_RUNNER_IMAGE="/cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main"

log "INFO Starting Apptainer container and configuring runner"

apptainer exec --writable-tmpfs --containall --fakeroot --bind /tmp/run/docker.sock:/tmp/run/docker.sock --bind /home/alexboden:/home/alexboden --bind /tmp:/tmp /cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main /bin/bash -c "export DOCKER_HOST=unix:///tmp/run/docker.sock && export RUNNER_ALLOW_RUNASROOT=1 && export PYTHONPATH=/home/runner/.local/lib/python3.10/site-packages && /home/runner/config.sh --work \"${GITHUB_ACTIONS_WKDIR}\" --url \"${REPO_URL}\" --token \"${REGISTRATION_TOKEN}\" --labels \"${LABELS}\" --name \"slurm-${SLURMD_NODENAME}-${SLURM_JOB_ID}\" --unattended --ephemeral && /home/runner/run.sh && /home/runner/config.sh remove --token \"${REMOVAL_TOKEN}\""
# --compat is shorthand for the following flags:

# --containall

# --no-init

# --no-umask

# --writable-tmpfs

# --no-eval

apptainer exec --writable-tmpfs --containall --fakeroot --bind /tmp/run/docker.sock:/tmp/run/docker.sock --bind /cvmfs/cvmfs-ephemeral.cluster.watonomous.ca:/cvmfs/cvmfs-ephemeral.cluster.watonomous.ca --bind /home/alexboden:/home/alexboden --bind /tmp:/tmp /cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main /bin/bash -c "export DOCKER_HOST=unix:///tmp/run/docker.sock && export RUNNER_ALLOW_RUNASROOT=1 && export PYTHONPATH=/home/runner/.local/lib/python3.10/site-packages && /home/runner/config.sh --work \"${GITHUB_ACTIONS_WKDIR}\" --url \"${REPO_URL}\" --token \"${REGISTRATION_TOKEN}\" --labels \"${LABELS}\" --name \"slurm-${SLURMD_NODENAME}-${SLURM_JOB_ID}\" --unattended --ephemeral && /home/runner/run.sh && /home/runner/config.sh remove --token \"${REMOVAL_TOKEN}\""

# apptainer exec --compat --fakeroot --bind /tmp/run/docker.sock:/tmp/run/docker.sock --bind /cvmfs/cvmfs-ephemeral.cluster.watonomous.ca:/cvmfs/cvmfs-ephemeral.cluster.watonomous.ca --bind /home/alexboden:/home/alexboden --bind /tmp:/tmp --bind /cvmfs/soft.computecanada.ca/:/cvmfs/soft.computecanada.ca/ /cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main /bin/bash -c "source /cvmfs/soft.computecanada.ca/config/profile/bash.sh && export DOCKER_HOST=unix:///tmp/run/docker.sock && export RUNNER_ALLOW_RUNASROOT=1 && export PYTHONPATH=/home/runner/.local/lib/python3.10/site-packages && /home/runner/config.sh --work \"${GITHUB_ACTIONS_WKDIR}\" --url \"${REPO_URL}\" --token \"${REGISTRATION_TOKEN}\" --labels \"${LABELS}\" --name \"slurm-${SLURMD_NODENAME}-${SLURM_JOB_ID}\" --unattended --ephemeral && /home/runner/run.sh && /home/runner/config.sh remove --token \"${REMOVAL_TOKEN}\""

log "INFO Runner removed (Duration: $(($end_time - $start_time)) seconds)"

Expand Down
2 changes: 1 addition & 1 deletion config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
GITHUB_API_BASE_URL = 'https://api.github.com/repos/WATonomous/infra-config'
GITHUB_REPO_URL = 'https://github.com/WATonomous/infra-config'
LIST_OF_RUNNER_LABELS = ["gh-arc-runners-small", "gh-arc-runners-medium", "gh-arc-runners-large", "gh-arc-runners-xlarge"]
ALLOCATE_RUNNER_SCRIPT_PATH = "allocate-ephemeral-runner-from-docker.sh"
ALLOCATE_RUNNER_SCRIPT_PATH = "allocate-ephemeral-runner-from-apptainer.sh"
80 changes: 44 additions & 36 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,15 @@ def poll_github_actions_and_allocate_runners(url, token, sleep_time=5):
while True:
data, _ = get_gh_api(url, token, etag)
if data:
logging.info("Changes detected.")
allocate_runners_for_jobs(data, token)
logging.info("Polling for queued workflows...")
time.sleep(sleep_time) # issues occur if you request too frequently


def get_all_jobs(workflow_id, token):
"""
Get all CI jobs for a given workflow ID by iterating through the paginated API response.
"""
"""
Get all CI jobs for a given workflow ID by iterating through the paginated API response.
"""
all_jobs = []
page = 1
per_page = 100 # Maximum number of jobs per page according to rate limits
Expand All @@ -106,35 +105,35 @@ def allocate_runners_for_jobs(workflow_data, token):
return

number_of_queued_workflows = workflow_data["total_count"]
logging.info(f"Total number of queued workflows: {number_of_queued_workflows}")
# logging.info(f"Total number of queued workflows: {number_of_queued_workflows}")
number_of_queued_workflows = len(workflow_data["workflow_runs"])
logging.info(f"Number of workflow runs: {number_of_queued_workflows}")
# logging.info(f"Number of workflow runs: {number_of_queued_workflows}")

for i in range(number_of_queued_workflows):
workflow_id = workflow_data["workflow_runs"][i]["id"]
logging.info(f"Evaluating workflow ID: {workflow_id}")
# If statement to check if the workflow is on the testing branch, remove this for production
branch = workflow_data["workflow_runs"][i]["head_branch"]
if branch != "alexboden/test-slurm-gha-runner":
logging.info(f"Skipping workflow {workflow_id} because it is not on the correct branch, branch: {branch}.")
# logging.info(f"Evaluating workflow ID: {workflow_id}")
# If statement to check if the workflow is on the testing branch, remove this for production
branch = workflow_data["workflow_runs"][i]["head_branch"]
if branch != "alexboden/test-slurm-gha-runner" and branch != "alexboden/test-ci-apptainer":
# logging.info(f"Skipping workflow {workflow_id} because it is not on the correct branch, branch: {branch}.")
continue
else:
logging.info(f"Processing workflow {workflow_id} because it is on the correct branch, branch: {branch}.")
# else:
# logging.info(f"Processing workflow {workflow_id} because it is on the correct branch, branch: {branch}.")
job_data = get_all_jobs(workflow_id, token)
logging.info(f"There are {len(job_data)} jobs in the workflow.")
# logging.info(f"There are {len(job_data)} jobs in the workflow.")
for job in job_data:
if job["status"] == "queued":
queued_job_id = job["id"]
allocate_actions_runner(queued_job_id, token)
logging.info(f"Job {job['name']} {job['id']} is queued.")
# logging.info(f"Job {job['name']} {job['id']} is queued.")
# else:
# logging.info(f"Job {job['name']} {job['id']} is not queued.")

def allocate_actions_runner(job_id, token):
"""
Allocates a runner for the given job ID by sending a POST request to the GitHub API to get a registration token.
Proceeds to submit a SLURM job to allocate the runner with the corresponding resources.
"""
"""
Allocates a runner for the given job ID by sending a POST request to the GitHub API to get a registration token.
Proceeds to submit a SLURM job to allocate the runner with the corresponding resources.
"""
if job_id in allocated_jobs:
logging.info(f"Runner already allocated for job {job_id}")
return
Expand All @@ -158,29 +157,31 @@ def allocate_actions_runner(job_id, token):
data, _ = get_gh_api(f'{GITHUB_API_BASE_URL}/actions/jobs/{job_id}', token)
labels = data["labels"] # should only be one label in prod
logging.info(f"Job labels: {labels}")


run_id = data['run_id']

allocated_jobs[job_id] = RunningJob(job_id, None, data['workflow_name'], data['name'], labels)

if labels[0] != "alextest-gh-arc-runners-small" and labels[0] != "alextest-gh-arc-runners-medium" and labels[0] != "alextest-gh-arc-runners-large" and labels[0] != "alextest-gh-arc-runners-xlarge":
if labels[0] != "alexboden-gh-arc-runners-small" and labels[0] != "alexboden-gh-arc-runners-medium" and labels[0] != "alexboden-gh-arc-runners-large" and labels[0] != "alexboden-gh-arc-runners-xlarge":
logging.info(f"Skipping job because it is not for the correct runner. labels: {labels}, labels[0]: {labels[0]}")
del allocated_jobs[job_id]
return

runner_size_label = "gh-arc-runners-small" # default to small runner
if "alextest-gh-arc-runners-medium" in labels:
if "alexboden-gh-arc-runners-medium" in labels:
runner_size_label = "gh-arc-runners-medium"
elif "alextest-gh-arc-runners-large" in labels:
elif "alexboden-gh-arc-runners-large" in labels:
runner_size_label = "gh-arc-runners-large"
elif "alextest-gh-arc-runners-xlarge" in labels:
elif "alexboden-gh-arc-runners-xlarge" in labels:
runner_size_label = "gh-arc-runners-xlarge"

logging.info(f"Using runner size label: {runner_size_label}")
runner_resources = get_runner_resources(runner_size_label)

# sbatch resource allocation command
# sbatch resource allocation command
command = [
"sbatch",
# f"--nodelist=trpro-slurm1",
# f"--nodelist=thor-slurm1",
f"--job-name=slurm-{runner_size_label}-{job_id}",
f"--mem-per-cpu={runner_resources['mem-per-cpu']}",
f"--cpus-per-task={runner_resources['cpu']}",
Expand All @@ -190,7 +191,8 @@ def allocate_actions_runner(job_id, token):
GITHUB_REPO_URL,
registration_token,
removal_token,
','.join(labels)
','.join(labels),
str(run_id)
]

logging.info(f"Running command: {' '.join(command)}")
Expand Down Expand Up @@ -254,11 +256,17 @@ def check_slurm_status():
continue

# Convert time strings to datetime objects
start_time = datetime.strptime(start_time_str, '%Y-%m-%dT%H:%M:%S') if start_time_str != 'Unknown' else None
end_time = datetime.strptime(end_time_str, '%Y-%m-%dT%H:%M:%S') if end_time_str != 'Unknown' else None

if status in ['COMPLETED', 'FAILED', 'CANCELLED', 'TIMEOUT']: # otherwise job is not finished running
duration = "[Unknown Duration]"

if status.startswith('COMPLETED') or status.startswith('FAILED') or status.startswith('CANCELLED') or status.startswith('TIMEOUT'):
try:
start_time = datetime.strptime(start_time_str, '%Y-%m-%dT%H:%M:%S')
end_time = datetime.strptime(end_time_str, '%Y-%m-%dT%H:%M:%S')
except Exception as e:
logging.error(f"Error parsing start/end time for job {job_component}: {e}")
start_time = None
end_time = None
duration = "[Unknown Duration]"

if start_time and end_time:
duration = end_time - start_time
logging.info(f"Slurm job {job_component} {status} in {duration}. Running Job Info: {str(runningjob)}")
Expand All @@ -271,16 +279,16 @@ def check_slurm_status():
del allocated_jobs[job_id]

def poll_slurm_statuses(sleep_time=5):
"""
Wrapper function to poll check_slurm_status.
"""
"""
Wrapper function to poll check_slurm_status.
"""
while True:
check_slurm_status()
time.sleep(sleep_time)

if __name__ == "__main__":
# need to use threading to achieve simultaneous polling
github_thread = threading.Thread(target=poll_github_actions_and_allocate_runners, args=(queued_workflows_url, GITHUB_ACCESS_TOKEN))
github_thread = threading.Thread(target=poll_github_actions_and_allocate_runners, args=(queued_workflows_url, GITHUB_ACCESS_TOKEN, 2))
slurm_thread = threading.Thread(target=poll_slurm_statuses)

github_thread.start()
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
requests==2.25.1
python-dotenv==1.0.0

0 comments on commit 4d79d49

Please sign in to comment.