From c8fb44e7bf262d823fe1a183edb28746afcf0eca Mon Sep 17 00:00:00 2001 From: German <28149841+germa89@users.noreply.github.com> Date: Tue, 27 Aug 2024 17:35:13 +0200 Subject: [PATCH] Supporting SLURM env vars for launching MAPDL configuration (#2754) * Adapting launcher to run on slurm * Working on nodes parser * Implemented machine argument * Update * Adding more debugging Adding memory option * Merge branch 'main' into feat/supporting-slurm-manager * Avoiding checking number of processors in slrum * Cleaning empty args * renaming argument * Update env vars to check to decide if ON_SLURM or not. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Removing unneeded * Adding comment * removing redundancies * Removing self * Removing unused arg * Update src/ansys/mapdl/core/launcher.py Co-authored-by: Maxime Rey <87315832+MaxJPRey@users.noreply.github.com> * fixing memory units * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update doc/styles/Vocab/ANSYS/accept.txt * Apply suggestions from code review Co-authored-by: Kathy Pippert <84872299+PipKat@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Kathy Pippert <84872299+PipKat@users.noreply.github.com> * Adding tests * Moving fixtures to main file * testing exec_file * fix tests * Adding env var documentation * Small refactoring regarding env var processing Adding typing * Better env var order * chore: adding changelog file 2754.documentation.md * Apply suggestions from code review Co-authored-by: Kathy Pippert <84872299+PipKat@users.noreply.github.com> * fix: table format --------- Co-authored-by: German Martinez Ayuso Co-authored-by: german Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Maxime Rey <87315832+MaxJPRey@users.noreply.github.com> Co-authored-by: Kathy Pippert <84872299+PipKat@users.noreply.github.com> 
Co-authored-by: pyansys-ci-bot <92810346+pyansys-ci-bot@users.noreply.github.com> --- doc/changelog.d/2754.documentation.md | 1 + doc/source/user_guide/mapdl.rst | 3 + .../config/vocabularies/ANSYS/accept.txt | 1 - src/ansys/mapdl/core/launcher.py | 271 +++++++++++++++--- tests/conftest.py | 25 ++ tests/test_launcher.py | 150 ++++++++++ 6 files changed, 417 insertions(+), 34 deletions(-) create mode 100644 doc/changelog.d/2754.documentation.md diff --git a/doc/changelog.d/2754.documentation.md b/doc/changelog.d/2754.documentation.md new file mode 100644 index 0000000000..0004117961 --- /dev/null +++ b/doc/changelog.d/2754.documentation.md @@ -0,0 +1 @@ +feat: Supporting SLURM env vars for launching MAPDL configuration \ No newline at end of file diff --git a/doc/source/user_guide/mapdl.rst b/doc/source/user_guide/mapdl.rst index 6885e9252c..bfc59931b5 100644 --- a/doc/source/user_guide/mapdl.rst +++ b/doc/source/user_guide/mapdl.rst @@ -1167,6 +1167,9 @@ These are described in the following table: | | export PYMAPDL_MAPDL_VERSION=22.2 | | | | +---------------------------------------+---------------------------------------------------------------------+ +| :envvar:`PYMAPDL_ON_SLURM` | With this environment variable set to ``FALSE``, you can prevent | +| | PyMAPDL from detecting that it is running on a SLURM HPC cluster. | ++---------------------------------------+---------------------------------------------------------------------+ | :envvar:`PYMAPDL_MAX_MESSAGE_LENGTH` | Maximum gRPC message length. If your | | | connection terminates when running | | | PRNSOL or NLIST, raise this. 
In bytes, | diff --git a/doc/styles/config/vocabularies/ANSYS/accept.txt b/doc/styles/config/vocabularies/ANSYS/accept.txt index 1d468cf83a..19629c86b6 100644 --- a/doc/styles/config/vocabularies/ANSYS/accept.txt +++ b/doc/styles/config/vocabularies/ANSYS/accept.txt @@ -99,7 +99,6 @@ Linder Linux MacOS mapdl -mapdl MAPDL mater MATLAB diff --git a/src/ansys/mapdl/core/launcher.py b/src/ansys/mapdl/core/launcher.py index ff15fbc138..51f98937b8 100644 --- a/src/ansys/mapdl/core/launcher.py +++ b/src/ansys/mapdl/core/launcher.py @@ -32,7 +32,7 @@ import tempfile import threading import time -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import warnings import psutil @@ -309,11 +309,12 @@ def launch_grpc( Number of processors. Defaults to 2. ram : float, optional - Fixed amount of memory to request for MAPDL. If ``None``, - then MAPDL will use as much as available on the host machine. + Total size in megabytes of the workspace (memory) used for the initial allocation. + The default is ``None``, in which case 2 GB (2048 MB) is used. To force a fixed size + throughout the run, specify a negative number. run_location : str, optional - MAPDL working directory. Defaults to a temporary working + MAPDL working directory. The default is the temporary working directory. 
port : int @@ -525,6 +526,9 @@ def launch_grpc( pymapdl._LOCAL_PORTS.append(port) + if not nproc: + nproc = 2 + cpu_sw = "-np %d" % nproc if ram: @@ -576,22 +580,22 @@ def launch_grpc( port_sw, grpc_sw, ] - command = " ".join(command_parm) else: # linux - command_parm = [] - command_parm.extend( - [ - '"%s"' % exec_file, - job_sw, - cpu_sw, - ram_sw, - additional_switches, - port_sw, - grpc_sw, - ] - ) - command = " ".join(command_parm) + command_parm = [ + '"%s"' % exec_file, + job_sw, + cpu_sw, + ram_sw, + additional_switches, + port_sw, + grpc_sw, + ] + + command_parm = [ + each for each in command_parm if each + ] # cleaning empty args. + command = " ".join(command_parm) LOG.debug(f"Starting MAPDL with command: {command}") @@ -1085,7 +1089,8 @@ def launch_mapdl( add_env_vars: Optional[Dict[str, str]] = None, replace_env_vars: Optional[Dict[str, str]] = None, version: Optional[Union[int, str]] = None, - **kwargs, + detect_slurm_config: bool = True, + **kwargs: Dict[str, Any], ) -> Union[MapdlGrpc, "MapdlConsole"]: """Start MAPDL locally. @@ -1116,8 +1121,9 @@ def launch_mapdl( Number of processors. Defaults to 2. ram : float, optional - Fixed amount of memory to request for MAPDL. If ``None``, - then MAPDL will use as much as available on the host machine. + Total size in megabytes of the workspace (memory) used for the initial allocation. + The default is ``None``, in which case 2 GB (2048 MB) is used. To force a fixed size + throughout the run, specify a negative number. mode : str, optional Mode to launch MAPDL. Must be one of the following: @@ -1441,9 +1447,43 @@ def launch_mapdl( "ANSYSLMD_LICENSE_FILE":"1055@MYSERVER"} >>> mapdl = launch_mapdl(replace_env_vars=my_env_vars) """ + # By default + ON_SLURM = os.environ.get("PYMAPDL_ON_SLURM", None) + if ON_SLURM is None: + ON_SLURM = True + else: + # Unless the env var is false, it will be true. 
+ ON_SLURM = not (ON_SLURM.lower() == "false") + + # Let's require the following env vars to exist to go into slurm mode. + ON_SLURM = ( + ON_SLURM + and bool(os.environ.get("SLURM_JOB_NAME", "")) + and bool(os.environ.get("SLURM_JOB_ID", "")) + ) + + if detect_slurm_config and ON_SLURM: + LOG.info("On Slurm mode.") + + # extracting parameters + exec_file, jobname, nproc, ram, additional_switches = _parse_slurm_options( + exec_file, + jobname, + nproc, + ram, + additional_switches, + **kwargs, + ) + # To avoid timeouts + license_server_check = False + start_timeout = 2 * start_timeout + ON_SLURM = True # Using this as main variable + else: + ON_SLURM = False + if remove_temp_files is not None: warnings.warn( - "The option ``remove_temp_files`` is being deprecated and it will be removed by PyMAPDL version 0.66.0.\n" + "The ``remove_temp_files`` option is being deprecated. It is to be removed in PyMAPDL version 0.66.0.\n" "Please use ``remove_temp_dir_on_exit`` instead.", DeprecationWarning, stacklevel=2, @@ -1637,7 +1677,7 @@ def launch_mapdl( start_parm, start_instance, version, - ) + ) # type: ignore mapdl = MapdlGrpc( ip=ip, @@ -1727,16 +1767,20 @@ def launch_mapdl( additional_switches = _check_license_argument(license_type, additional_switches) LOG.debug(f"Using additional switches {additional_switches}.") - # Setting number of processors - machine_cores = psutil.cpu_count(logical=False) - if not nproc: - if machine_cores < 2: # default required cores - nproc = machine_cores # to avoid starting issues + # Bypassing number of processors checks because VDI/VNC might have + # different number of processors than the cluster compute nodes. 
+ if not ON_SLURM: + # Setting number of processors + machine_cores = psutil.cpu_count(logical=False) + + if not nproc: + # Some machines only have 1 core + nproc = machine_cores if machine_cores < 2 else 2 else: - nproc = 2 - else: - if machine_cores < int(nproc): - raise NotEnoughResources + if machine_cores < int(nproc): + raise NotEnoughResources( + f"The machine has {machine_cores} cores. PyMAPDL is asking for {nproc} cores." + ) start_parm.update( { @@ -1791,7 +1835,7 @@ def launch_mapdl( start_parm, start_instance, version, - ) + ) # type: ignore port, actual_run_location, process = launch_grpc( port=port, @@ -2078,6 +2122,167 @@ def _parse_ip_route(output): return match[0] +def _parse_slurm_options( + exec_file: Optional[str], + jobname: str, + nproc: Optional[int], + ram: Optional[Union[str, int]], + additional_switches: str, + **kwargs: Dict[str, Any], +): + def get_value( + variable: str, + kwargs: Dict[str, Any], + default: Optional[Union[str, int, float]] = 1, + astype: Optional[Callable[[Any], Any]] = int, + ): + value_from_env_vars = os.environ.get(variable, None) + value_from_kwargs = kwargs.pop(variable, None) + value = value_from_kwargs or value_from_env_vars or default + if astype and value: + return astype(value) + else: + return value + + ## Getting env vars + SLURM_NNODES = get_value("SLURM_NNODES", kwargs) + LOG.info(f"SLURM_NNODES: {SLURM_NNODES}") + # ntasks is for mpi + SLURM_NTASKS = get_value("SLURM_NTASKS", kwargs) + LOG.info(f"SLURM_NTASKS: {SLURM_NTASKS}") + # Sharing tasks across multiple nodes (DMP) + # the format of this envvar is a bit tricky. Avoiding it for the moment. 
+ # SLURM_TASKS_PER_NODE = int( + # kwargs.pop( + # "SLURM_TASKS_PER_NODE", os.environ.get("SLURM_TASKS_PER_NODE", 1) + # ) + # ) + + # cpus-per-task is for multithreading, + # sharing tasks across multiple CPUs in same node (SMP) + SLURM_CPUS_PER_TASK = get_value("SLURM_CPUS_PER_TASK", kwargs) + LOG.info(f"SLURM_CPUS_PER_TASK: {SLURM_CPUS_PER_TASK}") + + # Set to value of the --ntasks option, if specified. See SLURM_NTASKS. Included for backwards compatibility. + SLURM_NPROCS = get_value("SLURM_NPROCS", kwargs) + LOG.info(f"SLURM_NPROCS: {SLURM_NPROCS}") + + # Number of CPUs allocated to the batch step. + SLURM_CPUS_ON_NODE = get_value("SLURM_CPUS_ON_NODE", kwargs) + LOG.info(f"SLURM_CPUS_ON_NODE: {SLURM_CPUS_ON_NODE}") + + SLURM_MEM_PER_NODE = get_value( + "SLURM_MEM_PER_NODE", kwargs, default=None, astype=None + ) + LOG.info(f"SLURM_MEM_PER_NODE: {SLURM_MEM_PER_NODE}") + + SLURM_NODELIST = get_value( + "SLURM_NODELIST", kwargs, default="", astype=None + ).lower() + LOG.info(f"SLURM_NODELIST: {SLURM_NODELIST}") + + if not exec_file: + exec_file = os.environ.get("PYMAPDL_MAPDL_EXEC", None) + + if not exec_file: + # We should probably make a way to find it. + # We will use the module thing + pass + LOG.info(f"Using MAPDL executable in: {exec_file}") + + if not jobname: + jobname = os.environ.get("SLURM_JOB_NAME", "file") + LOG.info(f"Using jobname: {jobname}") + + # Checking specific env var + if not nproc: + nproc = os.environ.get("PYMAPDL_NPROC", None) + if nproc: + nproc = int(nproc) + + if not nproc: + ## Attempt to calculate the appropriate number of cores: + # Reference: https://stackoverflow.com/a/51141287/6650211 + # I'm assuming the env var makes sense. + # + # - SLURM_CPUS_ON_NODE is a property of the cluster, not of the job. 
+ # + options = [ + # 4, # Fall back option + SLURM_CPUS_PER_TASK * SLURM_NTASKS, # (CPUs) + SLURM_NPROCS, # (CPUs) + # SLURM_NTASKS, # (tasks) Not necessary the number of CPUs, + # SLURM_NNODES * SLURM_TASKS_PER_NODE * SLURM_CPUS_PER_TASK, # (CPUs) + SLURM_CPUS_ON_NODE * SLURM_NNODES, # (cpus) + ] + LOG.info(f"On SLURM number of processors options {options}") + nproc = max(options) + + LOG.info(f"Setting number of CPUs to: {nproc}") + + if not ram: + if SLURM_MEM_PER_NODE: + # RAM argument is in MB, so we need to convert + + if SLURM_MEM_PER_NODE[-1] == "T": # tera + ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** 2 + elif SLURM_MEM_PER_NODE[-1] == "G": # giga + ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** 1 + elif SLURM_MEM_PER_NODE[-1].upper() == "K": # kilo + ram = int(SLURM_MEM_PER_NODE[:-1]) * (2**10) ** (-1) + else: # Mega + ram = int(SLURM_MEM_PER_NODE) + + LOG.info(f"Setting RAM to: {ram}") + + # We use "-dis " (with space) to avoid collision with user variables such + # as `-distro` or so + if "-dis " not in additional_switches and not additional_switches.endswith("-dis"): + additional_switches += " -dis" + + ## Getting the node list + machines = "" + # parsing nodes to list + if SLURM_NODELIST: + try: + p = subprocess.Popen( + ["scontrol", "show", "hostnames", f"{SLURM_NODELIST}"], + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + stderr = p.stderr.read().decode() + stdout = p.stdout.read().decode() + + if "Invalid hostlist" in stderr: + raise ValueError( + "The node list is invalid, or it could not be parsed.\n" + "Are you passing the nodes correctly?\n" + f"Nodes list: {SLURM_NODELIST}" + ) + if stderr: + raise RuntimeError(stderr) + nodes = stdout.strip().splitlines() + + machines = ":".join([f"{each_node}" for each_node in nodes]) + + # The following code creates the cmd line bit for MAPDL. It seems it + # is not needed in slurm. 
+ # machines = " -machines " + ":".join([ + # f"{each_node}:{SLURM_CPUS_ON_NODE}" for each_node in nodes + # ]) + + # We do not need to inject the machines in MAPDL command line. + # additional_switches += machines + LOG.info(f"Using nodes configuration: {machines}") + + except Exception as e: + LOG.info( + f"The machines list could not be obtained.\nThis error occurred:\n{str(e)}" + ) + + return exec_file, jobname, nproc, ram, additional_switches + + def pack_parameters( port, ip, diff --git a/tests/conftest.py b/tests/conftest.py index 47438a40c2..cc7c99442b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -655,6 +655,31 @@ def mapdl(request, tmpdir_factory): ) +@pytest.fixture(scope="function") +def set_env_var(request, monkeypatch): + """Set an environment variable from given requests, this fixture must be used with `parametrize`""" + env_var_name = request.param[0] + env_var_value = request.param[1] + monkeypatch.setenv(f"{env_var_name}", f"{env_var_value}") + yield request.param + + +@pytest.fixture(scope="function") +def set_env_var_context(request, monkeypatch): + """Set MY_VARIABLE environment variable, this fixture must be used with `parametrize`""" + if not isinstance(request.param, (tuple, list)): + request_param = [request.param] + else: + request_param = request.param + + for each_dict in request_param: + for each_key, each_value in each_dict.items(): + if each_value is not None: + monkeypatch.setenv(f"{each_key}", f"{each_value}") + + yield request.param + + @pytest.fixture def path_tests(tmpdir): p1 = tmpdir.mkdir("./temp/") diff --git a/tests/test_launcher.py b/tests/test_launcher.py index 23aa6749af..f497d9ca3e 100644 --- a/tests/test_launcher.py +++ b/tests/test_launcher.py @@ -44,6 +44,7 @@ _force_smp_student_version, _is_ubuntu, _parse_ip_route, + _parse_slurm_options, _validate_MPI, _verify_version, get_start_instance, @@ -560,6 +561,155 @@ def test_deprecate_verbose(): launch_grpc(verbose=True) +@pytest.mark.parametrize( + 
"set_env_var_context,validation", + ( + pytest.param( + { + "SLURM_NNODES": None, + "SLURM_NTASKS": None, + "SLURM_CPUS_PER_TASK": None, + "SLURM_NPROCS": None, + "SLURM_CPUS_ON_NODE": None, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + }, + {"nproc": 1}, + id="No parameters supplied", + ), + pytest.param( + { + "SLURM_NNODES": 5, + "SLURM_NTASKS": 1, + "SLURM_CPUS_PER_TASK": 1, + "SLURM_NPROCS": 1, + "SLURM_CPUS_ON_NODE": 1, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + }, + {"nproc": 5}, + id="Testing NNODE only", + ), + pytest.param( + { + "SLURM_NNODES": 5, + "SLURM_NTASKS": 1, + "SLURM_CPUS_PER_TASK": 1, + "SLURM_NPROCS": 1, + "SLURM_CPUS_ON_NODE": 2, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + }, + {"nproc": 10}, + id="Testing NNODE and CPUS_ON_NODE only", + ), + pytest.param( + { + "SLURM_NNODES": 1, + "SLURM_NTASKS": 5, + "SLURM_CPUS_PER_TASK": 1, + "SLURM_NPROCS": 1, + "SLURM_CPUS_ON_NODE": 1, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + }, + {"nproc": 5}, + id="Testing NTASKS only", + ), + pytest.param( + { + "SLURM_NNODES": 1, + "SLURM_NTASKS": 5, + "SLURM_CPUS_PER_TASK": 2, + "SLURM_NPROCS": 1, + "SLURM_CPUS_ON_NODE": 1, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + }, + {"nproc": 10}, + id="Testing NTASKS only", + ), + pytest.param( + { + "SLURM_NNODES": 2, + "SLURM_NTASKS": 2, + "SLURM_CPUS_PER_TASK": 2, + "SLURM_NPROCS": 18, + "SLURM_CPUS_ON_NODE": None, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + }, + {"nproc": 18}, + id="Testing NPROCS only", + ), + pytest.param( + # This test probably does not do a good memory mapping between + # MEM_PER_NODE and "ram" + { + "SLURM_NNODES": 4, + "SLURM_NTASKS": 2, + "SLURM_CPUS_PER_TASK": 2, + "SLURM_NPROCS": None, + "SLURM_CPUS_ON_NODE": None, + "SLURM_MEM_PER_NODE": "1000", + "SLURM_NODELIST": None, + }, + {"nproc": 4, "ram": 1000}, + id="Testing NNODES and MEM_PER_NODE", + ), + pytest.param( + { + "PYMAPDL_NPROC": 5, + 
"SLURM_JOB_NAME": "myawesomejob", + "SLURM_NTASKS": 2, + "SLURM_CPUS_PER_TASK": 2, + "SLURM_NPROCS": 1, + "SLURM_CPUS_ON_NODE": None, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + }, + {"nproc": 5, "jobname": "myawesomejob"}, + id="Testing PYMAPDL_NPROC and SLURM_JOB_NAME", + ), + pytest.param( + { + "PYMAPDL_NPROC": 5, + "SLURM_JOB_NAME": "myawesomejob", + "SLURM_NTASKS": 2, + "SLURM_CPUS_PER_TASK": 2, + "SLURM_NPROCS": 1, + "SLURM_CPUS_ON_NODE": None, + "SLURM_MEM_PER_NODE": None, + "SLURM_NODELIST": None, + "PYMAPDL_MAPDL_EXEC": "asdf/qwer/poiu", + }, + {"nproc": 5, "jobname": "myawesomejob", "exec_file": "asdf/qwer/poiu"}, + id="Testing PYMAPDL_NPROC and SLURM_JOB_NAME", + ), + ), + indirect=["set_env_var_context"], +) +def test__parse_slurm_options(set_env_var_context, validation): + """test slurm env vars""" + for each_key, each_value in set_env_var_context.items(): + if each_value: + assert os.environ.get(each_key) == str(each_value) + + exec_file, jobname, nproc, ram, additional_switches = _parse_slurm_options( + exec_file=None, jobname="", nproc=None, ram=None, additional_switches="" + ) + assert nproc == validation["nproc"] + + if ram: + assert ram == validation["ram"] + + if jobname != "file": + assert jobname == validation["jobname"] + + if exec_file and validation.get("exec_file", None): + assert exec_file == validation["exec_file"] + + @pytest.mark.parametrize( "start_instance,context", [