Skip to content

Commit

Permalink
root user
Browse files Browse the repository at this point in the history
  • Loading branch information
alexboden committed Oct 27, 2024
1 parent a0092d8 commit afad831
Show file tree
Hide file tree
Showing 7 changed files with 166 additions and 14 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
main.py
31 changes: 17 additions & 14 deletions allocate-ephemeral-runner-with-stargz.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ record_timing() {
}

# Check if all required arguments are provided
if [ $# -lt 4 ]; then
if [ $# -lt 5 ]; then
log "ERROR: Missing required arguments"
log "Usage: $0 <repo-url> <registration-token> <removal-token> <label>"
log "Usage: $0 <repo-url> <registration-token> <removal-token> <label> <run_id>"
exit 1
fi

Expand Down Expand Up @@ -71,19 +71,21 @@ record_timing "Start stargz" $duration
export DOCKER_HOST=unix:///tmp/run/docker.sock

# Define the parent directory for GitHub Actions in the host machine
PARENT_DIR="/tmp/runner-${SLURMD_NODENAME}-${SLURM_JOB_ID}"
PROVISIONER_DIR="/mnt/wato-drive/alexboden/provisioner-cache/"
PARENT_DIR="/dev/shm/docker/runner-${SLURMD_NODENAME}-${SLURM_JOB_ID}"
PROVISIONER_DIR="/mnt/wato-drive2/alexboden/provisioner-cache/$RUN_ID"
log "INFO Parent directory for GitHub Actions: $PARENT_DIR"

start_time=$(date +%s)
GITHUB_ACTIONS_WKDIR="$PARENT_DIR/_work"
mkdir -p $PARENT_DIR
ls -l /mnt/wato-drive2/alexboden/provisioner-cache
mkdir -p $PROVISIONER_DIR
chown -R $(id -u):$(id -g) $PARENT_DIR
chmod -R 777 $PARENT_DIR
end_time=$(date +%s)
duration=$((end_time - start_time))
log "INFO Created and set permissions for parent directory (Duration: $duration seconds)"
record_timing "Create Parent Directory" $duration
log "INFO Created and set permissions for parent and provisioner directories (Duration: $duration seconds)"
record_timing "Create Directories" $duration

# Start the actions runner container
log "INFO Starting actions runner container"
Expand All @@ -98,12 +100,13 @@ record_timing "Start Container" $duration
log "INFO Configuring container"
start_time=$(date +%s)
./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chmod 666 /var/run/docker.sock" # Allows the runner to access the docker socket
./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "mkdir \"$PARENT_DIR\""
./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "mkdir -p \"$PARENT_DIR\""
./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chown -R runner:runner \"$PARENT_DIR\""
docker exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chmod -R 755 \"$PARENT_DIR\""
./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chmod -R 755 \"$PARENT_DIR\""

./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chown -R runner:runner \"$PROVISIONER_DIR\""
docker exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chmod -R 755 \"$PROVISIONER_DIR\""
# ./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "mkdir -p \"$PROVISIONER_DIR\""
# ./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chown -R root:root \"$PROVISIONER_DIR\""
./bin/nerdctl exec $DOCKER_CONTAINER_ID /bin/bash -c "sudo chmod -R 777 \"$PROVISIONER_DIR\""

end_time=$(date +%s)
duration=$((end_time - start_time))
Expand Down Expand Up @@ -138,8 +141,8 @@ record_timing "Remove Runner" $duration
# Clean up
log "INFO Stopping and removing Docker container"
start_time=$(date +%s)
docker stop $DOCKER_CONTAINER_ID
docker rm $DOCKER_CONTAINER_ID
./bin/nerdctl stop $DOCKER_CONTAINER_ID
./bin/nerdctl rm $DOCKER_CONTAINER_ID
end_time=$(date +%s)
duration=$((end_time - start_time))
log "INFO Docker container removed (Duration: $duration seconds)"
Expand All @@ -155,9 +158,9 @@ kill $SNAPSHOTTER_PID

wait $CONTAINERD_PID
wait $SNAPSHOTTER_PID
end_time=$(date +%s)

rootlesskit rm -rf /tmp/{run,config,containerd*,nerdctl}
end_time=$(date +%s)
duration=$((end_time - start_time))
log "INFO Containerd and Stargz cleaned up (Duration: $duration seconds)"

Expand All @@ -176,4 +179,4 @@ for duration in "${timings[@]}"; do
done
log "Total Time: $total_time seconds"

exit 0
exit 0
25 changes: 25 additions & 0 deletions apptainer-helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash
#
# Registers an ephemeral GitHub Actions runner, runs it, and afterwards removes
# the runner registration. Meant to be executed inside the Apptainer shell.
#
# Positional arguments (all seven are required):
#   1. GITHUB_ACTIONS_WKDIR  work directory handed to config.sh via --work
#   2. REPO_URL              repository URL handed to config.sh via --url
#   3. REGISTRATION_TOKEN    token used to register the runner
#   4. LABELS                labels assigned to the runner
#   5. SLURMD_NODENAME       node name, used in the runner name
#   6. SLURM_JOB_ID          job id, used in the runner name
#   7. REMOVAL_TOKEN         token used to remove the runner afterwards

# Refuse to continue unless exactly seven arguments were supplied
if [[ $# -ne 7 ]]; then
    echo "Usage: $0 <GITHUB_ACTIONS_WKDIR> <REPO_URL> <REGISTRATION_TOKEN> <LABELS> <SLURMD_NODENAME> <SLURM_JOB_ID> <REMOVAL_TOKEN>"
    exit 1
fi

# Give the positional arguments readable names
GITHUB_ACTIONS_WKDIR="$1"
REPO_URL="$2"
REGISTRATION_TOKEN="$3"
LABELS="$4"
SLURMD_NODENAME="$5"
SLURM_JOB_ID="$6"
REMOVAL_TOKEN="$7"

# Environment seen by the runner scripts started below
export DOCKER_HOST=unix:///tmp/run/docker.sock
export RUNNER_ALLOW_RUNASROOT=1
export PYTHONPATH=/home/runner/.local/lib/python3.10/site-packages

# Register the runner under the name slurm-<node>-<job id>
/home/runner/config.sh \
    --work "${GITHUB_ACTIONS_WKDIR}" \
    --url "${REPO_URL}" \
    --token "${REGISTRATION_TOKEN}" \
    --labels "${LABELS}" \
    --name "slurm-${SLURMD_NODENAME}-${SLURM_JOB_ID}" \
    --unattended \
    --ephemeral

# Run the runner; the next command only starts once this one returns
/home/runner/run.sh

# Remove the runner registration
/home/runner/config.sh remove --token "${REMOVAL_TOKEN}"
4 changes: 4 additions & 0 deletions containerd-config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Declares a proxy plugin named "stargz" of type "snapshot".
[proxy_plugins]
[proxy_plugins.stargz]
type = "snapshot"
# Socket path of the containerd-stargz-grpc service, located under /tmp/run.
address = "/tmp/run/containerd-stargz-grpc/containerd-stargz-grpc.sock"
4 changes: 4 additions & 0 deletions echo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Debug helper: lists the user directory and the provisioner cache on
# wato-drive, then creates one hard-coded subdirectory inside the cache.

USER_DIR=/mnt/wato-drive/alexboden
CACHE_DIR="${USER_DIR}/provisioner-cache"

ls -l "${USER_DIR}"
ls -l "${CACHE_DIR}"

# Plain mkdir (no -p): reports an error if the directory already exists
mkdir "${CACHE_DIR}/111312526769"
54 changes: 54 additions & 0 deletions notes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Issues encountered
1. Docker in docker
2. Cloudflare 502 errors
3. Caching docker images


# 3
Does caching occur in the machine?
1. Testing

# Testing basic
Average times for each task:
Register Runner: 4.02 seconds
Configure Container: 0.37 seconds
Run Runner: 128.38 seconds
Start Container: 69.29 seconds
Start Docker: 4.65 seconds
Remove Container: 12.67 seconds
Create Parent Directory: 0.02 seconds
Remove Runner: 0.44 seconds

Average times for each task:
Register Runner: 19.64 seconds
Configure Container: 3.00 seconds
Run Runner: 258.25 seconds
Start Container: 33.52 seconds
Start Docker: 10.39 seconds
Remove Container: 0.02 seconds
Create Parent Directory: 0.02 seconds
Remove Runner: 0.72 seconds


Received status code: 401 Unauthorized. Refreshing creds...

Options:
Apptainer:
- sif file in apptainer
- cvmfs with apptainer

stargz-snapshotter:
- stargz-snapshotter

Docker:
try docker load from wato-drive

look at the preprovision to see how to provision the image
debugging
- ./kuberenetes.sh k9s -A

make new uid for container and user.
base on slurm-dist, hard code version


- try caching on ram for client image
61 changes: 61 additions & 0 deletions poll_slurm_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Description: This script is used to poll the status of SLURM jobs that have been submitted by the user.
# It was used mainly for debugging purposes to ensure that the jobs were being submitted and running as expected.

import logging
import subprocess
import time
from datetime import datetime

# sacct states after which a job is considered finished and is no longer tracked.
_FINISHED_STATES = frozenset({'COMPLETED', 'FAILED', 'CANCELLED', 'TIMEOUT'})


def _parse_sacct_time(value):
    """Return *value* parsed as a sacct timestamp, or ``None`` if it is not one.

    sacct prints a placeholder (e.g. ``Unknown``) instead of a timestamp when
    the time is not available; any such non-timestamp text yields ``None``
    rather than raising.
    """
    try:
        return datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        return None


def check_slurm_status():
    """Query sacct for every tracked job and stop tracking the finished ones.

    Reads the module-level ``allocated_jobs`` mapping (job id -> running-job
    object exposing ``slurm_job_id``); NOTE(review): that mapping is not
    defined in the visible part of this file and must be provided by the
    surrounding module. For each job whose main sacct record is in a finished
    state, the outcome and duration are logged and the job is removed from
    ``allocated_jobs``.

    Errors while querying a single job are logged and do not stop the
    remaining jobs from being checked.
    """
    if not allocated_jobs:
        return

    finished = set()
    # Iterate over a copy so the mapping itself is only modified after the loop.
    for job_id, runningjob in allocated_jobs.copy().items():
        if not runningjob or not runningjob.slurm_job_id:
            continue

        # Use 'sacct' to get job status and start/end time for a single job
        sacct_cmd = ['sacct', '-n', '-P', '-o', 'JobID,State,Start,End', '--jobs', str(runningjob.slurm_job_id)]
        try:
            sacct_result = subprocess.run(sacct_cmd, capture_output=True, text=True)

            if sacct_result.returncode != 0:
                logging.error(f"sacct command failed with return code {sacct_result.returncode}")
                logging.error(f"Error output: {sacct_result.stderr}")
                continue

            for line in sacct_result.stdout.strip().split('\n'):
                parts = line.split('|')
                if len(parts) < 4:
                    continue  # Sometimes it takes a while for the job to appear in the sacct output

                job_component, status, start_time_str, end_time_str = parts[:4]

                # Only the main job record matters; step records carry a
                # '.<step>' suffix (e.g. '3840.batch', '3840.extern', '3840.0').
                if '.' in job_component:
                    continue

                # The state may carry a suffix (e.g. 'CANCELLED by 1000'),
                # so compare only its first word.
                state_words = status.split()
                if not state_words or state_words[0] not in _FINISHED_STATES:
                    continue  # job is not finished running

                start_time = _parse_sacct_time(start_time_str)
                end_time = _parse_sacct_time(end_time_str)
                duration = "[Unknown Duration]"
                if start_time and end_time:
                    duration = end_time - start_time
                logging.info(f"Slurm job {job_component} {status} in {duration}. Running Job Info: {str(runningjob)}")
                finished.add(job_id)

        except Exception as e:
            # Best-effort polling: a failure for one job must not abort the others.
            logging.error(f"Error querying SLURM job details for job ID {runningjob.slurm_job_id}: {e}")

    for job_id in finished:
        # pop() with a default tolerates the job having been removed elsewhere meanwhile.
        allocated_jobs.pop(job_id, None)

def poll_slurm_statuses(sleep_time=1):
    """Run ``check_slurm_status`` forever, pausing between rounds.

    Args:
        sleep_time: Seconds to sleep after each status check.

    This function never returns.
    """
    while True:
        # One polling round, then wait before the next one.
        check_slurm_status()
        time.sleep(sleep_time)

0 comments on commit afad831

Please sign in to comment.