From de0bba602145185e362c8e6922b0dfaac4f85027 Mon Sep 17 00:00:00 2001 From: Rafael Sarmiento Date: Fri, 9 Aug 2024 17:22:02 +0200 Subject: [PATCH 1/4] split hostlist --- chart/f7t4jhub/files/jupyterhub-config.py | 1 + dockerfiles/Dockerfile | 4 ++-- firecrestspawner/spawner.py | 11 ++++++++--- requirements.txt | 1 + 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/chart/f7t4jhub/files/jupyterhub-config.py b/chart/f7t4jhub/files/jupyterhub-config.py index 0331ee1..b023d7c 100644 --- a/chart/f7t4jhub/files/jupyterhub-config.py +++ b/chart/f7t4jhub/files/jupyterhub-config.py @@ -140,6 +140,7 @@ async def refresh_user(self, user, handler=None): {% if memory %}#SBATCH --mem={{`{{memory}}`}}{% endif %} {% if gres %}#SBATCH --gres={{`{{gres}}`}}{% endif %} {% if nprocs %}#SBATCH --cpus-per-task={{`{{nprocs}}`}}{% endif %} +{% if nnodes %}#SBATCH --nodes={{`{{nnodes[0]}}`}}{% endif %} {% if reservation%}#SBATCH --reservation={{`{{reservation[0]}}`}}{% endif %} {% if constraint %}#SBATCH --constraint={{`{{constraint[0]}}`}}{% endif %} {% if options %}#SBATCH {{`{{options}}`}}{% endif %} diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index f185f68..a6540e2 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -20,7 +20,7 @@ RUN . /opt/conda/bin/activate && \ RUN . /opt/conda/bin/activate && \ conda activate $__CONDA_ENV__ && \ - pip install --no-cache jupyterhub==4.1.6 pyfirecrest==2.1.0 SQLAlchemy==1.4.52 oauthenticator==16.3.1 + pip install --no-cache jupyterhub==4.1.6 pyfirecrest==2.1.0 SQLAlchemy==1.4.52 oauthenticator==16.3.1 python-hostlist==1.23.0 COPY . firecrestspawner RUN . /opt/conda/bin/activate && \ @@ -68,4 +68,4 @@ USER juhu WORKDIR /home/juhu -CMD . /opt/conda/bin/activate && jupyterhub +CMD . /opt/conda/bin/activate && conda activate ${__CONDA_ENV__} && jupyterhub diff --git a/firecrestspawner/spawner.py b/firecrestspawner/spawner.py index a71c101..54e84b9 100644 --- a/firecrestspawner/spawner.py +++ b/firecrestspawner/spawner.py @@ -12,6 +12,7 @@ import re import sys import jupyterhub +import hostlist import firecrest as f7t from enum import Enum from jinja2 import Template @@ -236,6 +237,7 @@ async def submit_batch_script(self): self.job = await client.submit(self.host, script_str=script, env_vars=job_env) + self.log.debug(f"[client.submit] {self.job}") self.job_id = str(self.job['jobid']) self.log.info(f'Job {self.job_id} submitted') except Exception as e: @@ -259,8 +261,9 @@ async def query_job_status(self): client = await self.get_firecrest_client() self.log.debug('firecREST: Polling job') poll_result = await client.poll(self.host, [self.job_id]) + self.log.debug(f"[client.poll] [query_job_status] {poll_result}") state = poll_result[0]['state'] - host = poll_result[0]['nodelist'] + host = hostlist.expand_hostlist(poll_result[0]['nodelist'])[0] # `job_status` must keep the format used in the original # batchspawner since it will be later parsed with # regular expressions @@ -284,7 +287,8 @@ async def cancel_batch_job(self): self.log.info(f"Cancelling job {self.job_id}") client = await self.get_firecrest_client() self.log.info('firecREST: Canceling job') - await client.cancel(self.host, self.job_id) + cancel_result = await client.cancel(self.host, self.job_id) + self.log.debug(f"[client.cancel] {cancel_result}") def load_state(self, state): """load `job_id` from state""" @@ -498,9 +502,10 @@ async def state_gethost(self): client = await self.get_firecrest_client() poll_result = await client.poll(self.host, [self.job_id]) + self.log.debug(f"[client.poll] [state_gethost] {poll_result}") # FIXME: this is expecting `nodelist` to be only a single # node. Fix it so it can work with multiple nodes. - host = poll_result[0]['nodelist'] + host = hostlist.expand_hostlist(poll_result[0]['nodelist'])[0] return self.node_name_template.format(host) diff --git a/requirements.txt b/requirements.txt index 2d4494b..fe6b854 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ jupyterhub==4.1.6 pyfirecrest==2.1.0 SQLAlchemy==1.4.52 oauthenticator==16.3.1 +python-hostlist==1.23.0 From 2682571b52eb8403ec732e23d0c9d1e0c00d5721 Mon Sep 17 00:00:00 2001 From: Rafael Sarmiento Date: Fri, 9 Aug 2024 18:02:19 +0200 Subject: [PATCH 2/4] fix for the case of PENDING --- firecrestspawner/spawner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/firecrestspawner/spawner.py b/firecrestspawner/spawner.py index 54e84b9..e06d238 100644 --- a/firecrestspawner/spawner.py +++ b/firecrestspawner/spawner.py @@ -263,7 +263,9 @@ async def query_job_status(self): poll_result = await client.poll(self.host, [self.job_id]) self.log.debug(f"[client.poll] [query_job_status] {poll_result}") state = poll_result[0]['state'] - host = hostlist.expand_hostlist(poll_result[0]['nodelist'])[0] + nodelist = hostlist.expand_hostlist(poll_result[0]['nodelist']) + # when PENDING nodelist is [] + host = nodelist[0] if len(nodelist) > 0 else "" # `job_status` must keep the format used in the original # batchspawner since it will be later parsed with # regular expressions @@ -503,8 +505,6 @@ async def state_gethost(self): client = await self.get_firecrest_client() poll_result = await client.poll(self.host, [self.job_id]) self.log.debug(f"[client.poll] [state_gethost] {poll_result}") - # FIXME: this is expecting `nodelist` to be only a single - # node. Fix it so it can work with multiple nodes. host = hostlist.expand_hostlist(poll_result[0]['nodelist'])[0] return self.node_name_template.format(host) From ed8c147f9ec2c4a490c886d6467427b66e585e8b Mon Sep 17 00:00:00 2001 From: Rafael Sarmiento Date: Fri, 9 Aug 2024 18:22:41 +0200 Subject: [PATCH 3/4] update dockerfile --- dockerfiles/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index a6540e2..edf9aaa 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:22.04 +FROM ubuntu:24.10 ARG __CONDA_ENV__=py311 From 4be6db313f54badd8c75fa96990021be3d39e10a Mon Sep 17 00:00:00 2001 From: Rafael Sarmiento Date: Fri, 9 Aug 2024 18:35:15 +0200 Subject: [PATCH 4/4] update dockerfile --- dockerfiles/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index edf9aaa..7a3a70e 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:24.10 +FROM ubuntu:24.04 ARG __CONDA_ENV__=py311