diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..75e1fc9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM python:3.9-slim + +# Set the working directory in the container +WORKDIR /app + +# Copy the Python script and any necessary files +COPY main.py config.py runner_size_config.py RunningJob.py allocate-ephemeral-runner-from-apptainer.sh /app/ + +# Install any required Python packages +COPY requirements.txt /app/ +RUN pip install -r requirements.txt + +# Run the Python script +CMD ["python", "main.py"] diff --git a/allocate-ephemeral-runner-from-apptainer.sh b/allocate-ephemeral-runner-from-apptainer.sh index 04768a6..52ddbfc 100644 --- a/allocate-ephemeral-runner-from-apptainer.sh +++ b/allocate-ephemeral-runner-from-apptainer.sh @@ -20,16 +20,13 @@ REMOVAL_TOKEN=$3 LABELS=$4 RUN_ID=$5 -export DOCKER_HOST=unix:///tmp/run/docker.sock - -# Define the parent directory for GitHub Actions in the host machine PARENT_DIR="/tmp/runner-${SLURMD_NODENAME}-${SLURM_JOB_ID}" -PROVISIONER_DIR="/mnt/wato-drive2/alexboden/provisioner-cache/$RUN_ID" +# PROVISIONER_DIR="/mnt/wato-drive/alexboden/provisioner-cache/$RUN_ID" log "INFO Parent directory for GitHub Actions: $PARENT_DIR" start_time=$(date +%s) -mkdir -p $PROVISIONER_DIR -chmod -R 777 $PROVISIONER_DIR +# mkdir -p $PROVISIONER_DIR +# chmod -R 777 $PROVISIONER_DIR GITHUB_ACTIONS_WKDIR="$PARENT_DIR/_work" mkdir -p $PARENT_DIR $GITHUB_ACTIONS_WKDIR chmod -R 777 $PARENT_DIR @@ -39,6 +36,7 @@ log "INFO Created and set permissions for parent directory (Duration: $(($end_ti log "INFO Starting Docker on Slurm" start_time=$(date +%s) slurm-start-dockerd.sh +export DOCKER_HOST=unix:///tmp/run/docker.sock if [ $? 
-ne 0 ]; then log "ERROR Docker failed to start (non-zero exit code)" exit 1 @@ -52,11 +50,25 @@ source /cvmfs/soft.computecanada.ca/config/profile/bash.sh module load apptainer # Define the Docker image to use -DOCKER_IMAGE="/cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main" +export ACTIONS_RUNNER_IMAGE="/cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main" log "INFO Starting Apptainer container and configuring runner" -apptainer exec --writable-tmpfs --containall --fakeroot --bind /tmp/run/docker.sock:/tmp/run/docker.sock --bind /home/alexboden:/home/alexboden --bind /tmp:/tmp /cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main /bin/bash -c "export DOCKER_HOST=unix:///tmp/run/docker.sock && export RUNNER_ALLOW_RUNASROOT=1 && export PYTHONPATH=/home/runner/.local/lib/python3.10/site-packages && /home/runner/config.sh --work \"${GITHUB_ACTIONS_WKDIR}\" --url \"${REPO_URL}\" --token \"${REGISTRATION_TOKEN}\" --labels \"${LABELS}\" --name \"slurm-${SLURMD_NODENAME}-${SLURM_JOB_ID}\" --unattended --ephemeral && /home/runner/run.sh && /home/runner/config.sh remove --token \"${REMOVAL_TOKEN}\"" +# --compat is + + # --containall + + # --no-init + + # --no-umask + + # --writable-tmpfs + + # --no-eval + +apptainer exec --writable-tmpfs --containall --fakeroot --bind /tmp/run/docker.sock:/tmp/run/docker.sock --bind /cvmfs/cvmfs-ephemeral.cluster.watonomous.ca:/cvmfs/cvmfs-ephemeral.cluster.watonomous.ca --bind /home/alexboden:/home/alexboden --bind /tmp:/tmp /cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main /bin/bash -c "export DOCKER_HOST=unix:///tmp/run/docker.sock && export RUNNER_ALLOW_RUNASROOT=1 && export PYTHONPATH=/home/runner/.local/lib/python3.10/site-packages && /home/runner/config.sh --work \"${GITHUB_ACTIONS_WKDIR}\" --url \"${REPO_URL}\" --token \"${REGISTRATION_TOKEN}\" --labels \"${LABELS}\" --name \"slurm-${SLURMD_NODENAME}-${SLURM_JOB_ID}\" --unattended --ephemeral && 
/home/runner/run.sh && /home/runner/config.sh remove --token \"${REMOVAL_TOKEN}\"" + +# apptainer exec --compat --fakeroot --bind /tmp/run/docker.sock:/tmp/run/docker.sock --bind /cvmfs/cvmfs-ephemeral.cluster.watonomous.ca:/cvmfs/cvmfs-ephemeral.cluster.watonomous.ca --bind /home/alexboden:/home/alexboden --bind /tmp:/tmp --bind /cvmfs/soft.computecanada.ca/:/cvmfs/soft.computecanada.ca/ /cvmfs/unpacked.cern.ch/ghcr.io/watonomous/actions-runner-image:main /bin/bash -c "source /cvmfs/soft.computecanada.ca/config/profile/bash.sh && export DOCKER_HOST=unix:///tmp/run/docker.sock && export RUNNER_ALLOW_RUNASROOT=1 && export PYTHONPATH=/home/runner/.local/lib/python3.10/site-packages && /home/runner/config.sh --work \"${GITHUB_ACTIONS_WKDIR}\" --url \"${REPO_URL}\" --token \"${REGISTRATION_TOKEN}\" --labels \"${LABELS}\" --name \"slurm-${SLURMD_NODENAME}-${SLURM_JOB_ID}\" --unattended --ephemeral && /home/runner/run.sh && /home/runner/config.sh remove --token \"${REMOVAL_TOKEN}\"" log "INFO Runner removed (Duration: $(($end_time - $start_time)) seconds)" diff --git a/config.py b/config.py index b355991..dab5571 100644 --- a/config.py +++ b/config.py @@ -1,4 +1,4 @@ GITHUB_API_BASE_URL = 'https://api.github.com/repos/WATonomous/infra-config' GITHUB_REPO_URL = 'https://github.com/WATonomous/infra-config' LIST_OF_RUNNER_LABELS = ["gh-arc-runners-small", "gh-arc-runners-medium", "gh-arc-runners-large", "gh-arc-runners-xlarge"] -ALLOCATE_RUNNER_SCRIPT_PATH = "allocate-ephemeral-runner-from-docker.sh" +ALLOCATE_RUNNER_SCRIPT_PATH = "allocate-ephemeral-runner-from-apptainer.sh" diff --git a/main.py b/main.py index bd01349..f5067dd 100644 --- a/main.py +++ b/main.py @@ -72,16 +72,15 @@ def poll_github_actions_and_allocate_runners(url, token, sleep_time=5): while True: data, _ = get_gh_api(url, token, etag) if data: - logging.info("Changes detected.") allocate_runners_for_jobs(data, token) logging.info("Polling for queued workflows...") time.sleep(sleep_time) # issues occur if 
you request to frequently def get_all_jobs(workflow_id, token): - """ - Get all CI jobs for a given workflow ID by iterating through the paginated API response. - """ + """ + Get all CI jobs for a given workflow ID by iterating through the paginated API response. + """ all_jobs = [] page = 1 per_page = 100 # Maximum number of jobs per page according to rate limits @@ -106,35 +105,35 @@ def allocate_runners_for_jobs(workflow_data, token): return number_of_queued_workflows = workflow_data["total_count"] - logging.info(f"Total number of queued workflows: {number_of_queued_workflows}") + # logging.info(f"Total number of queued workflows: {number_of_queued_workflows}") number_of_queued_workflows = len(workflow_data["workflow_runs"]) - logging.info(f"Number of workflow runs: {number_of_queued_workflows}") + # logging.info(f"Number of workflow runs: {number_of_queued_workflows}") for i in range(number_of_queued_workflows): workflow_id = workflow_data["workflow_runs"][i]["id"] - logging.info(f"Evaluating workflow ID: {workflow_id}") - # If statement to check if the workflow is on the testing branch, remove this for production - branch = workflow_data["workflow_runs"][i]["head_branch"] - if branch != "alexboden/test-slurm-gha-runner": - logging.info(f"Skipping workflow {workflow_id} because it is not on the correct branch, branch: {branch}.") + # logging.info(f"Evaluating workflow ID: {workflow_id}") + # If statement to check if the workflow is on the testing branch, remove this for production + branch = workflow_data["workflow_runs"][i]["head_branch"] + if branch != "alexboden/test-slurm-gha-runner" and branch != "alexboden/test-ci-apptainer": + # logging.info(f"Skipping workflow {workflow_id} because it is not on the correct branch, branch: {branch}.") continue - else: - logging.info(f"Processing workflow {workflow_id} because it is on the correct branch, branch: {branch}.") + # else: + # logging.info(f"Processing workflow {workflow_id} because it is on the correct 
branch, branch: {branch}.") job_data = get_all_jobs(workflow_id, token) - logging.info(f"There are {len(job_data)} jobs in the workflow.") + # logging.info(f"There are {len(job_data)} jobs in the workflow.") for job in job_data: if job["status"] == "queued": queued_job_id = job["id"] allocate_actions_runner(queued_job_id, token) - logging.info(f"Job {job['name']} {job['id']} is queued.") + # logging.info(f"Job {job['name']} {job['id']} is queued.") # else: # logging.info(f"Job {job['name']} {job['id']} is not queued.") def allocate_actions_runner(job_id, token): - """ - Allocates a runner for the given job ID by sending a POST request to the GitHub API to get a registration token. - Proceeds to submit a SLURM job to allocate the runner with the corresponding resources. - """ + """ + Allocates a runner for the given job ID by sending a POST request to the GitHub API to get a registration token. + Proceeds to submit a SLURM job to allocate the runner with the corresponding resources. + """ if job_id in allocated_jobs: logging.info(f"Runner already allocated for job {job_id}") return @@ -158,29 +157,31 @@ def allocate_actions_runner(job_id, token): data, _ = get_gh_api(f'{GITHUB_API_BASE_URL}/actions/jobs/{job_id}', token) labels = data["labels"] # should only be one label in prod logging.info(f"Job labels: {labels}") - + + run_id = data['run_id'] + allocated_jobs[job_id] = RunningJob(job_id, None, data['workflow_name'], data['name'], labels) - if labels[0] != "alextest-gh-arc-runners-small" and labels[0] != "alextest-gh-arc-runners-medium" and labels[0] != "alextest-gh-arc-runners-large" and labels[0] != "alextest-gh-arc-runners-xlarge": + if labels[0] != "alexboden-gh-arc-runners-small" and labels[0] != "alexboden-gh-arc-runners-medium" and labels[0] != "alexboden-gh-arc-runners-large" and labels[0] != "alexboden-gh-arc-runners-xlarge": logging.info(f"Skipping job because it is not for the correct runner. 
labels: {labels}, labels[0]: {labels[0]}") del allocated_jobs[job_id] return runner_size_label = "gh-arc-runners-small" # default to small runner - if "alextest-gh-arc-runners-medium" in labels: + if "alexboden-gh-arc-runners-medium" in labels: runner_size_label = "gh-arc-runners-medium" - elif "alextest-gh-arc-runners-large" in labels: + elif "alexboden-gh-arc-runners-large" in labels: runner_size_label = "gh-arc-runners-large" - elif "alextest-gh-arc-runners-xlarge" in labels: + elif "alexboden-gh-arc-runners-xlarge" in labels: runner_size_label = "gh-arc-runners-xlarge" logging.info(f"Using runner size label: {runner_size_label}") runner_resources = get_runner_resources(runner_size_label) - # sbatch resource allocation command + # sbatch resource allocation command command = [ "sbatch", - # f"--nodelist=trpro-slurm1", + # f"--nodelist=thor-slurm1", f"--job-name=slurm-{runner_size_label}-{job_id}", f"--mem-per-cpu={runner_resources['mem-per-cpu']}", f"--cpus-per-task={runner_resources['cpu']}", @@ -190,7 +191,8 @@ def allocate_actions_runner(job_id, token): GITHUB_REPO_URL, registration_token, removal_token, - ','.join(labels) + ','.join(labels), + str(run_id) ] logging.info(f"Running command: {' '.join(command)}") @@ -254,11 +256,17 @@ def check_slurm_status(): continue # Convert time strings to datetime objects - start_time = datetime.strptime(start_time_str, '%Y-%m-%dT%H:%M:%S') if start_time_str != 'Unknown' else None - end_time = datetime.strptime(end_time_str, '%Y-%m-%dT%H:%M:%S') if end_time_str != 'Unknown' else None - - if status in ['COMPLETED', 'FAILED', 'CANCELLED', 'TIMEOUT']: # otherwise job is not finished running - duration = "[Unknown Duration]" + + if status.startswith('COMPLETED') or status.startswith('FAILED') or status.startswith('CANCELLED') or status.startswith('TIMEOUT'): + try: + start_time = datetime.strptime(start_time_str, '%Y-%m-%dT%H:%M:%S') + end_time = datetime.strptime(end_time_str, '%Y-%m-%dT%H:%M:%S') + except Exception as e: + 
logging.error(f"Error parsing start/end time for job {job_component}: {e}") + start_time = None + end_time = None + duration = "[Unknown Duration]" + if start_time and end_time: duration = end_time - start_time logging.info(f"Slurm job {job_component} {status} in {duration}. Running Job Info: {str(runningjob)}") @@ -271,16 +279,16 @@ del allocated_jobs[job_id] def poll_slurm_statuses(sleep_time=5): - """ - Wrapper function to poll check_slurm_status. - """ + """ + Wrapper function to poll check_slurm_status. + """ while True: check_slurm_status() time.sleep(sleep_time) if __name__ == "__main__": # need to use threading to achieve simultaneous polling - github_thread = threading.Thread(target=poll_github_actions_and_allocate_runners, args=(queued_workflows_url, GITHUB_ACCESS_TOKEN)) + github_thread = threading.Thread(target=poll_github_actions_and_allocate_runners, args=(queued_workflows_url, GITHUB_ACCESS_TOKEN, 2)) slurm_thread = threading.Thread(target=poll_slurm_statuses) github_thread.start() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e8f8205 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests==2.25.1 +python-dotenv==1.0.0 \ No newline at end of file