Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split host list when multiple nodes are allocated #29

Merged
merged 4 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions chart/f7t4jhub/files/jupyterhub-config.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ async def refresh_user(self, user, handler=None):
{% if memory %}#SBATCH --mem={{`{{memory}}`}}{% endif %}
{% if gres %}#SBATCH --gres={{`{{gres}}`}}{% endif %}
{% if nprocs %}#SBATCH --cpus-per-task={{`{{nprocs}}`}}{% endif %}
{% if nnodes %}#SBATCH --nodes={{`{{nnodes[0]}}`}}{% endif %}
{% if reservation%}#SBATCH --reservation={{`{{reservation[0]}}`}}{% endif %}
{% if constraint %}#SBATCH --constraint={{`{{constraint[0]}}`}}{% endif %}
{% if options %}#SBATCH {{`{{options}}`}}{% endif %}
Expand Down
6 changes: 3 additions & 3 deletions dockerfiles/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ubuntu:22.04
FROM ubuntu:24.04

ARG __CONDA_ENV__=py311

Expand All @@ -20,7 +20,7 @@ RUN . /opt/conda/bin/activate && \

RUN . /opt/conda/bin/activate && \
conda activate $__CONDA_ENV__ && \
pip install --no-cache jupyterhub==4.1.6 pyfirecrest==2.1.0 SQLAlchemy==1.4.52 oauthenticator==16.3.1
pip install --no-cache jupyterhub==4.1.6 pyfirecrest==2.1.0 SQLAlchemy==1.4.52 oauthenticator==16.3.1 python-hostlist==1.23.0

COPY . firecrestspawner
RUN . /opt/conda/bin/activate && \
Expand Down Expand Up @@ -68,4 +68,4 @@ USER juhu

WORKDIR /home/juhu

CMD . /opt/conda/bin/activate && jupyterhub
CMD . /opt/conda/bin/activate && conda activate ${__CONDA_ENV__} && jupyterhub
15 changes: 10 additions & 5 deletions firecrestspawner/spawner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import re
import sys
import jupyterhub
import hostlist
import firecrest as f7t
from enum import Enum
from jinja2 import Template
Expand Down Expand Up @@ -236,6 +237,7 @@ async def submit_batch_script(self):
self.job = await client.submit(self.host,
script_str=script,
env_vars=job_env)
self.log.debug(f"[client.submit] {self.job}")
self.job_id = str(self.job['jobid'])
self.log.info(f'Job {self.job_id} submitted')
except Exception as e:
Expand All @@ -259,8 +261,11 @@ async def query_job_status(self):
client = await self.get_firecrest_client()
self.log.debug('firecREST: Polling job')
poll_result = await client.poll(self.host, [self.job_id])
self.log.debug(f"[client.poll] [query_job_status] {poll_result}")
state = poll_result[0]['state']
host = poll_result[0]['nodelist']
nodelist = hostlist.expand_hostlist(poll_result[0]['nodelist'])
# when PENDING nodelist is []
host = nodelist[0] if len(nodelist) > 0 else ""
# `job_status` must keep the format used in the original
# batchspawner since it will be later parsed with
# regular expressions
Expand All @@ -284,7 +289,8 @@ async def cancel_batch_job(self):
self.log.info(f"Cancelling job {self.job_id}")
client = await self.get_firecrest_client()
self.log.info('firecREST: Canceling job')
await client.cancel(self.host, self.job_id)
cancel_result = await client.cancel(self.host, self.job_id)
self.log.debug(f"[client.cancel] {cancel_result}")

def load_state(self, state):
"""load `job_id` from state"""
Expand Down Expand Up @@ -498,9 +504,8 @@ async def state_gethost(self):

client = await self.get_firecrest_client()
poll_result = await client.poll(self.host, [self.job_id])
# FIXME: this is expecting `nodelist` to be only a single
# node. Fix it so it can work with multiple nodes.
host = poll_result[0]['nodelist']
self.log.debug(f"[client.poll] [state_gethost] {poll_result}")
host = hostlist.expand_hostlist(poll_result[0]['nodelist'])[0]
return self.node_name_template.format(host)


Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ jupyterhub==4.1.6
pyfirecrest==2.1.0
SQLAlchemy==1.4.52
oauthenticator==16.3.1
python-hostlist==1.23.0
Loading