Skip to content

Commit

Permalink
Updating GPU (accelerator) support in MLCube. (mlcommons#351)
Browse files Browse the repository at this point in the history
This commit changes how MLCube runtime works with GPUs. Users have two options to provide information on required accelerators:
- It can be done in MLCube configuration file in the `platform` section (platform.accelerator_count). This value is optional, and if present, may be an empy / non-set string, or an integer. This values is the number of required accelerators, and semantically, is equivalent to docker's `--gpu=N` CLI argument.
- It can be done on a command line using `--gpus` argument, e.g., `mlcube run ... --gpus=4`. This parameter accepts same values as docker's `--gpus` CLI argument does. Concretely:
  - When not set MLCube will use platform.accelerator_count value if present.
  - When set, this overrides any value assigned to platform.accelerator_count. Could be empty (`--gpus=`) to disable GPUs, all (`--gpus=all`) to use all available GPUs, GPU count (`--gpus=N`) or list of concrete GPUs (`--gpus="device=0,2"`). In this list GPU indices or UUIDs can be used.

Docker runner will pretty much use this value unmodified and will pass it to docker run command, e.g., `--gpus` flag will present. Singularity runner will pass `--nv` command when GPUs are requested. No CUDA_VISIBLE_DEVICES, SINGULARITYENV_CUDA_VISIBLE_DEVICES or any other env variable will be set by MLCube runtime (docker NVIDIA runtime may set NVIDIA_VISIBLE_DEVICES).

To debug for possble issues, enable debug mode (e.g., `mlcube --log-level=debug run ...`) and search the output for the log lines that contain `DEBUG Device spec (...) resolved to ...` and `INFO Device params ... resolved to ...`. They will provide additional information on how MLCube runtime determines how GPUs should be used.
  • Loading branch information
sergey-serebryakov authored Jan 18, 2024
1 parent 5f79b7a commit 8833ec0
Show file tree
Hide file tree
Showing 3 changed files with 398 additions and 32 deletions.
232 changes: 220 additions & 12 deletions mlcube/mlcube/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,17 @@
- `CliParser`: Helper utilities to parse command linea arguments.
"""
import abc
import logging
import os
import typing as t
from dataclasses import dataclass

from omegaconf import DictConfig, OmegaConf

from mlcube.errors import ConfigurationError

logger = logging.getLogger(__name__)


class MLCubeInstance(abc.ABC):
"""Base class for different instantiations of MLCube (local, remote, directory, archive, container ...)."""
Expand Down Expand Up @@ -44,6 +50,213 @@ def uri(self) -> str:
return os.path.join(self.path, self.file)


class DeviceSpecs:
"""GPU specifications."""
@dataclass
class Device:
index: t.Optional[int] = None # GPU index in a system (depends on ordering - see `CUDA_DEVICE_ORDER`).
uuid: t.Optional[str] = None # always unique, e.g., GPU-1f22a253-c329-dfb7-0db4-e005efb6a4c7.

@classmethod
def create(cls, device: str) -> "DeviceSpecs.Device":
device = device.strip()
try:
return cls(index=int(device))
except ValueError:
return cls(uuid=device)

def str_spec(self) -> str:
return str(self.index) if self.index is not None else self.uuid

def __str__(self) -> str:
return f"Device(index={self.index}, uuid={self.uuid})"

@dataclass
class DockerSpecs:
gpus: t.Optional[str] = None
cuda_visible_devices: t.Optional[str] = None

def __init__(self) -> None:
"""Init GPU specs so that it's none by default.
The values are set in factory methods. Only one instance variable out of 4 must be set.
"""
self._none: bool = True
"""No GPUs requested - use this to check if GPUs have not been requested."""
self._all: bool = False
"""All GPUs requested (--gpus=all)."""
self._num_devices: t.Optional[int] = None # Just number of GPUs
"""Some number of GPUs requested (--gpus=2 for 2 (first) GPUs) - this either None or positive, never zero."""
self._devices: t.Optional[t.List[DeviceSpecs.Device]] = None
"""List of GPU indices or IDs (--gpus=1,2 or --gpus=GPU-f234r2f23) - either None or at least one device."""

@property
def none(self) -> bool:
return self._none

@property
def all(self) -> bool:
return self._all

@property
def num_devices(self) -> t.Optional[int]:
return self._num_devices

@property
def devices(self) -> t.Optional[t.List["DeviceSpecs.Device"]]:
return self._devices if self._devices is None else self._devices.copy()

def check_with_platform_specs(self, accelerator_count: t.Optional[int] = None) -> None:
if accelerator_count is None:
# Number of accelerators is not specified in the MLCube configuration file. We assume this is really
# optional now, so will not be doing any further checks.
return
if accelerator_count < 0:
# This should generally never happen here since MLCube needs to validate values before this method is
# called (what is normally should happen right before MLCube runs the MLCube project).
return
if accelerator_count == 0:
# The accelerator count value has been set to 0
if self.none:
logger.info(
"`platform.accelerator_count = 0` is consistent with device specs (%s).", str(self)
)
return
logger.warning(
"`platform.accelerator_count = 0` is not consistent with device specs (%s).", str(self)
)
# Some number of accelerators has been specified
if (
self.all is True or
self.num_devices is not None and self.num_devices == accelerator_count or
self._devices is not None and len(self._devices) == accelerator_count
):
logger.info(
"`platform.accelerator_count = %d` is probably consistent with device specs (%s).",
accelerator_count, str(self)
)
logger.warning(
"`platform.accelerator_count = %d` is not consistent with device specs (%s).",
accelerator_count, str(self)
)

def get_docker_specs(self) -> "DeviceSpecs.DockerSpecs":
if self.none:
# Do not provide --gpus flag
return DeviceSpecs.DockerSpecs()
if self.all:
# provide --gpus=all, do not know yet how to compute total number of devices
logger.warning(
"Device docker specs: identifying CUDA_VISIBLE_DEVICES when gpus = 'all' is not supported yet."
)
return DeviceSpecs.DockerSpecs(gpus="all")
if self.num_devices is not None:
# --gpus=N, CUDA_VISIBLE_DEVICES=0,1,2,..N-1
return DeviceSpecs.DockerSpecs(
gpus=f"{self.num_devices}",
cuda_visible_devices=",".join(str(i) for i in range(self.num_devices))
)
# --gpus=device=A,B,C, CUDA_VISIBLE_DEVICES=0,1,2,..N-1
assert isinstance(self._devices, list) and len(self._devices) > 0
return DeviceSpecs.DockerSpecs(
gpus="device=" + ",".join(dev.str_spec() for dev in self._devices),
cuda_visible_devices=",".join(str(i) for i in range(len(self._devices)))
)

@classmethod
def from_string(cls, gpus: t.Optional[str] = None) -> "DeviceSpecs":
_gpus = str(gpus)
gpus = (gpus or "").strip()

# No GPUs requested
if not gpus:
logger.debug("Device specs (`%s`) resolved to `none`.", _gpus)
return DeviceSpecs()

# Exposes all available GPUs (e.g., 8). To set CUDA_VISIBLE_DEVICES, MLCube needs to identify the number of
# available GPUs. The NVIDIA_VISIBLE_DEVICES=all will be set automatically by docker.
if gpus == "all":
logger.debug("Device specs (`%s`) resolved to `all`.", _gpus)
gpu_specs = DeviceSpecs()
gpu_specs._none = False
gpu_specs._all = True
return gpu_specs

# Exposes "first" N GPUs. CUDA_VISIBLE_DEVICES is set to list(0, range(N)). The NVIDIA_VISIBLE_DEVICES will be
# set automatically (in this case, it seems like these two environment variables will have the same value).
try:
num_gpus = int(gpus)
if num_gpus <= 0:
logger.debug("Device spec (`%s`) resolved to `none`.", _gpus)
return DeviceSpecs()
logger.debug("Device spec (`%s`) resolved to device count (num_devices=%d)", _gpus, num_gpus)
gpu_specs = DeviceSpecs()
gpu_specs._none = False
gpu_specs._num_devices = num_gpus
return gpu_specs
except ValueError:
...

if gpus.startswith("device="):
_devices = [
device for device in (
device.strip() for device in gpus[7:].split(",")
) if device
]
if not _devices:
logger.debug("Device spec (`%s`) resolved to `none`.", _gpus)
return DeviceSpecs()

logger.debug("Device spec (`%s`) resolved to device list (devices=%s).", _gpus, _devices)
gpu_specs = DeviceSpecs()
gpu_specs._none = False
gpu_specs._devices = [DeviceSpecs.Device.create(device) for device in _devices]
return gpu_specs

# Do not know how to parse
raise ConfigurationError(
f"The `gpus` configuration parameter has invalid or unsupported value (gpus=`{gpus}`). It can take one"
"of the following values: (1) empty or not specified, (2) 'all', (3) be single integer value or (4) "
"be a string that starts with 'device=' that is followed by a comma-separated list of integers or device "
"IDs (strings). For more details, see "
"https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html."
)

@classmethod
def from_config(cls, accelerator_count: t.Optional[int] = None, gpus: t.Optional[str] = None) -> "DeviceSpecs":
"""Probably a temporary workaround to infer device specs based upon platform section and CLI's --gpus arg."""

# Determine initial device specs. The `accelerator_count` is part of the MLCube's `platform` section that
# defines (optionally) number of GPUs required by an MLCube. If not set, specs set to none (not needed).
if accelerator_count is not None and accelerator_count > 0:
platform_device_specs = DeviceSpecs.from_string(str(accelerator_count))
else:
platform_device_specs = DeviceSpecs()

# The `gpus` arg is none when it's not provided by a user. We need to differentiate between not set (None) and
# empty (e.g., --gpus="") what means disable GPUs.
if gpus is None:
device_specs = platform_device_specs
else:
# Get device specs from CLI (e.g., mlcube run ... --gpus='"device=1,5"'). These always, always override
# platform device specs when present. We also print out warning (no actions will be taken) if user specs
# (--gpus) conflict with platform specs (platform.accelerator_count).
device_specs = DeviceSpecs.from_string(gpus)
device_specs.check_with_platform_specs(platform_device_specs.num_devices)

logger.info(
"Device params `platform.accelerator_count` (%s) and `--gpus` (%s) resolved to device specs (%s).",
str(accelerator_count), str(gpus), str(device_specs)
)

return device_specs

def __str__(self) -> str:
devices = "None" if self._devices is None else str([str(dev) for dev in self._devices])
return f"{self.__class__.__name__}(none={self._none}, all={self._all}, num_gpus={self._num_devices}, "\
f"devices={devices})"


class CliParser(object):
"""Helper utilities to parse command linea arguments."""

Expand Down Expand Up @@ -96,7 +309,7 @@ def parse_extra_arg(
`--` prefix, and are normally parsed by libraries such as `click` or `argparse`. This dictionary will
also include such arguments as `--platform`, `--mlcube` and others. Keys in this dictionary are
argument names without `--` prefix. The following is the list of arguments this function can parse:
- `platform`: Platform to use to run this MLCube (docker, singularity, gcp, k8s etc).
- `platform`: Platform to use to run this MLCube (docker, singularity, gcp, k8s, etc.).
- `network_option`: Networking options defined during MLCube container execution.
- `security_option`: Security options defined during MLCube container execution.
- `gpus_option`: GPU usage options defined during MLCube container execution.
Expand All @@ -116,7 +329,9 @@ def parse_extra_arg(
]
task_args = {arg[0]: arg[1] for arg in task_args}

# Parse unparsed arguments
# Set runner-specific parameters - think about refactoring, this needs to be done by runners.
# Orr, maybe, these parameters can go first into platform section, and then be parsed by runners.
# When not present, they need to be None. Empty values (e.g., --gpus="") will be interpreted as set.
platform: t.Optional[str] = parsed_args.get("platform", None)
if platform in {"docker", "singularity"}:
runner_run_args = {}
Expand All @@ -125,18 +340,11 @@ def parse_extra_arg(
if parsed_args.get("security", None):
key = "--security-opt" if platform == "docker" else "--security"
runner_run_args[key] = parsed_args["security"]
if parsed_args.get("gpus", None):
cuda_visible_devices = parsed_args["gpus"]
if "device" in cuda_visible_devices:
cuda_visible_devices = cuda_visible_devices.replace("device=", "")
elif str(cuda_visible_devices).isnumeric():
cuda_visible_devices = str(list(range(int(cuda_visible_devices))))
cuda_visible_devices = cuda_visible_devices.replace(" ", "")[1:-1]
if parsed_args.get("gpus", None) is not None:
if platform == "docker":
runner_run_args["--gpus"] = cuda_visible_devices
runner_run_args["--gpus"] = parsed_args["gpus"].strip()
else:
runner_run_args["--nv"] = ""
os.environ["SINGULARITYENV_CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
if parsed_args.get("memory", None):
key = "--memory" if platform == "docker" else "--vm-ram"
runner_run_args[key] = parsed_args["memory"]
Expand All @@ -160,7 +368,7 @@ def parse_optional_arg(
cpu_option: t.Optional[str],
mount_option: t.Optional[str],
) -> t.Tuple[DictConfig, t.Dict]:
"""platform: Platform to use to run this MLCube (docker, singularity, gcp, k8s etc).
"""platform: Platform to use to run this MLCube (docker, singularity, gcp, k8s, etc.).
network_option: Networking options defined during MLCube container execution.
security_option: Security options defined during MLCube container execution.
gpus_option: GPU usage options defined during MLCube container execution.
Expand Down
Loading

0 comments on commit 8833ec0

Please sign in to comment.