Cleans tensorParallelDegree with MultiDevice #1222

Closed
wants to merge 1 commit into from
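
This change drops the TENSOR_PARALLEL_DEGREE environment variable and the tensor_parallel_degree model option from PyEnv/PyModel and instead derives the tensor parallel degree from the device attached to the model: when the NDManager reports a Device.MultiDevice, the degree is the number of devices it spans. The same pattern recurs throughout the diff below; a minimal sketch of it (the helper class and method name are illustrative, only the DJL calls shown in the diff are assumed):

import ai.djl.Device;
import ai.djl.Device.MultiDevice;
import ai.djl.Model;

final class TpDegreeSketch {
    // Returns the number of devices spanned by the model's device,
    // or 0 when the model is bound to a single device (no tensor parallelism).
    static int tensorParallelDegree(Model model) {
        Device device = model.getNDManager().getDevice();
        if (device instanceof MultiDevice) {
            return ((MultiDevice) device).getDevices().size();
        }
        return 0;
    }
}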
@@ -13,6 +13,7 @@
package ai.djl.python.engine;

import ai.djl.Device;
import ai.djl.Device.MultiDevice;
import ai.djl.Model;
import ai.djl.engine.EngineException;
import ai.djl.inference.streaming.ChunkedBytesSupplier;
@@ -123,7 +124,11 @@ CompletableFuture<Output> send(Input input) throws InterruptedException {
static String[] getPythonStartCmd(PyEnv pyEnv, Model model, int workerId, int port) {
Device device = model.getNDManager().getDevice();
int deviceId = device.getDeviceId();
int tensorParallelDegree = pyEnv.getTensorParallelDegree();
int tensorParallelDegree = 0;
if (model.getNDManager().getDevice() instanceof MultiDevice) {
tensorParallelDegree =
((MultiDevice) model.getNDManager().getDevice()).getDevices().size();
}
if (pyEnv.isMpiMode()) {
String cudaDevices = getVisibleDevices(workerId, tensorParallelDegree);
logger.info("Set CUDA_VISIBLE_DEVICES={}", cudaDevices);
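A small observation on the hunk above: getPythonStartCmd already holds the device in a local variable, so the new branch could reuse it instead of calling model.getNDManager().getDevice() a second time. A sketch of the equivalent form (same behavior):

Device device = model.getNDManager().getDevice();
int deviceId = device.getDeviceId();
int tensorParallelDegree = 0;
if (device instanceof MultiDevice) {
    tensorParallelDegree = ((MultiDevice) device).getDevices().size();
}
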
42 changes: 4 additions & 38 deletions engines/python/src/main/java/ai/djl/python/engine/PyEnv.java
@@ -12,9 +12,9 @@
*/
package ai.djl.python.engine;

import ai.djl.Device.MultiDevice;
import ai.djl.Model;
import ai.djl.engine.EngineException;
import ai.djl.util.NeuronUtils;
import ai.djl.util.Platform;
import ai.djl.util.Utils;
import ai.djl.util.cuda.CudaUtils;
@@ -52,7 +52,6 @@ public class PyEnv {
private String handler;
private int predictTimeout;
private int modelLoadingTimeout;
private int tensorParallelDegree;
private Map<String, String> envs;
private Map<String, String> initParameters;
private boolean initialized;
@@ -302,41 +301,7 @@ public void setPythonExecutable(String pythonExecutable) {
this.pythonExecutable = pythonExecutable;
}

/**
* Returns the tensor parallel degree.
*
* @return the tensor parallel degree
*/
public int getTensorParallelDegree() {
if (tensorParallelDegree == 0) {
String value = Utils.getenv("TENSOR_PARALLEL_DEGREE");
if ("max".equals(value)) {
tensorParallelDegree = getDefaultTensorParallelDegree();
} else if (value != null) {
tensorParallelDegree = Integer.parseInt(value);
}
}
return tensorParallelDegree;
}

static int getDefaultTensorParallelDegree() {
int gpus = CudaUtils.getGpuCount();
if (gpus > 0) {
return gpus;
}
return NeuronUtils.getNeuronCores();
}

/**
* Sets the tensor parallel degree.
*
* @param tensorParallelDegree the tensor parallel degree
*/
public void setTensorParallelDegree(int tensorParallelDegree) {
this.tensorParallelDegree = tensorParallelDegree;
}

int getMpiWorkers() {
int getMpiWorkers(MultiDevice multiDevice) {
int gpuCount = CudaUtils.getGpuCount();
String visibleDevices = Utils.getenv("CUDA_VISIBLE_DEVICES");
if (gpuCount > 0 && visibleDevices != null) {
@@ -346,7 +311,8 @@ int getMpiWorkers() {
}
gpuCount = visibleCount;
}
return gpuCount / getTensorParallelDegree();
int tensorParallelDegree = multiDevice.getDevices().size();
return gpuCount / tensorParallelDegree;
}

/**
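With the env-var handling removed, getMpiWorkers now takes the MultiDevice explicitly and divides the visible GPU count by its device count. A worked example (the GPU count and device count are assumptions for illustration):

// Assume CudaUtils.getGpuCount() reports 8 GPUs, CUDA_VISIBLE_DEVICES is unset,
// and the model was loaded on a MultiDevice spanning 4 devices.
int gpuCount = 8;                                 // stand-in for CudaUtils.getGpuCount()
int tensorParallelDegree = 4;                     // multiDevice.getDevices().size()
int mpiWorkers = gpuCount / tensorParallelDegree; // 8 / 4 = 2 MPI worker groups
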
56 changes: 26 additions & 30 deletions engines/python/src/main/java/ai/djl/python/engine/PyModel.java
@@ -14,14 +14,14 @@

import ai.djl.BaseModel;
import ai.djl.Device;
import ai.djl.Device.MultiDevice;
import ai.djl.Model;
import ai.djl.engine.EngineException;
import ai.djl.inference.Predictor;
import ai.djl.ndarray.NDManager;
import ai.djl.ndarray.types.DataType;
import ai.djl.translate.Translator;
import ai.djl.util.Utils;
import ai.djl.util.cuda.CudaUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -127,13 +127,6 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
case "parallel_loading":
parallelLoading = Boolean.parseBoolean(value);
break;
case "tensor_parallel_degree":
if ("max".equals(value)) {
pyEnv.setTensorParallelDegree(PyEnv.getDefaultTensorParallelDegree());
} else {
pyEnv.setTensorParallelDegree(Integer.parseInt(value));
}
break;
case "handler":
pyEnv.setHandler(value);
break;
@@ -164,8 +157,13 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
entryPoint = modelFile.toFile().getName();
} else if ("DeepSpeed".equals(engineName)) {
entryPoint = "djl_python.deepspeed";
} else if ("nc".equals(manager.getDevice().getDeviceType())
&& pyEnv.getTensorParallelDegree() > 0) {
} else if (manager.getDevice() instanceof MultiDevice
&& "nc"
.equals(
((MultiDevice) manager.getDevice())
.getDevices()
.get(0)
.getDeviceType())) {
entryPoint = "djl_python.transformers_neuronx";
} else if (isTrtLlmBackend) {
entryPoint = "djl_python.tensorrt_llm";
@@ -200,17 +198,13 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
}

if (pyEnv.isMpiMode()) {
int partitions = pyEnv.getTensorParallelDegree();
if (partitions == 0) {
partitions = CudaUtils.getGpuCount();
pyEnv.setTensorParallelDegree(partitions);
setProperty("tensor_parallel_degree", String.valueOf(partitions));
logger.info(
"No tensor parallel degree specified. Defaulting to all available GPUs.");
}
logger.info("Loading model in MPI mode with TP: {}.", partitions);
MultiDevice multiDevice = (MultiDevice) manager.getDevice();
int partitions = multiDevice.getDevices().size();
setProperty("tensor_parallel_degree", String.valueOf(partitions));
logger.info(
"Loading model in MPI mode with TP: {}. Devices {}", partitions, multiDevice);

int mpiWorkers = pyEnv.getMpiWorkers();
int mpiWorkers = pyEnv.getMpiWorkers(multiDevice);
if (mpiWorkers <= 0) {
throw new EngineException(
"GPU devices are not enough to run " + partitions + " partitions.");
@@ -233,13 +227,14 @@ public void load(Path modelPath, String prefix, Map<String, ?> options) throws I
+ " but the value is set to "
+ getProperty("gpu.maxWorkers"));
}
mpiWorkers = Integer.parseInt(getProperty("gpu.maxWorkers"));

properties.forEach((k, v) -> pyEnv.addParameter(k, v));

createAllPyProcesses(mpiWorkers, partitions);
createAllPyProcesses(multiDevice.getDevices(), partitions);
} else {
int tensorParallelDegree = pyEnv.getTensorParallelDegree();
int tensorParallelDegree = 0;
if (manager.getDevice() instanceof MultiDevice) {
tensorParallelDegree = ((MultiDevice) manager.getDevice()).getDevices().size();
}
if (tensorParallelDegree > 0) {
if (getProperty("maxWorkers") == null && getProperty("gpu.maxWorkers") == null) {
setProperty("gpu.minWorkers", "1");
@@ -300,21 +295,22 @@ private Path findModelFile(String prefix) {
return modelFile;
}

private void createAllPyProcesses(int mpiWorkers, int tp) {
private void createAllPyProcesses(List<Device> devices, int tp) {
int mpiWorkers = devices.size();
long begin = System.currentTimeMillis();
ExecutorService pool = null;
List<Future<?>> futures = new ArrayList<>();
if (parallelLoading) {
pool = Executors.newFixedThreadPool(mpiWorkers);
}
logger.info("Start {} mpiWorkers ...", mpiWorkers);
int deviceId = manager.getDevice().getDeviceId();
for (int i = 0; i < mpiWorkers; ++i) {
logger.debug("Pre-creating python worker: {} ", i);
PyProcess worker = new PyProcess(this, pyEnv, deviceId + i * tp);
for (Device device : devices) {
int deviceId = device.getDeviceId();
logger.debug("Pre-creating python worker: {} ", deviceId);
PyProcess worker = new PyProcess(this, pyEnv, deviceId * tp);
workerQueue.offer(worker);
if (pool != null) {
logger.debug("Submitting to pool: {}", i);
logger.debug("Submitting to pool: {}", deviceId);
futures.add(pool.submit(worker::startPythonProcess));
} else {
worker.startPythonProcess();
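One readability note on the PyModel changes above: the Neuron entry-point condition nests a cast, a list access, and a string comparison across several lines; pulling the device into a local variable keeps it compact without changing behavior. A sketch of the condition in isolation:

// Equivalent to the added else-if branch: pick the Neuron entry point when the
// model is bound to a MultiDevice whose devices are of type "nc".
Device dev = manager.getDevice();
if (dev instanceof MultiDevice
        && "nc".equals(((MultiDevice) dev).getDevices().get(0).getDeviceType())) {
    entryPoint = "djl_python.transformers_neuronx";
}
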
@@ -12,6 +12,8 @@
*/
package ai.djl.python.engine;

import ai.djl.Device;
import ai.djl.Device.MultiDevice;
import ai.djl.Model;
import ai.djl.engine.EngineException;
import ai.djl.metric.Metric;
@@ -59,12 +61,12 @@ class PyProcess {
this.workerId = workerId;
int port = counter.getAndIncrement();
if (pyEnv.isMpiMode()) {
int tensorParallelDegree = pyEnv.getTensorParallelDegree();
connections = new ArrayList<>(tensorParallelDegree);
for (int i = 0; i < tensorParallelDegree; ++i) {
List<Device> devices = ((MultiDevice) model.getNDManager().getDevice()).getDevices();
connections = new ArrayList<>(devices.size());
for (int i = 0; i < devices.size(); ++i) {
connections.add(new Connection(pyEnv, port, i));
}
counter.set(port + tensorParallelDegree);
counter.set(port + devices.size());
} else {
connections = Collections.singletonList(new Connection(pyEnv, port, -1));
}
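In the PyProcess constructor above, each MPI worker now opens one Connection per device of the MultiDevice and advances the shared port counter by that device count. A worked example of the port bookkeeping (the starting port value is hypothetical):

// With port = 19000 and a MultiDevice spanning 4 devices, four connections are
// created with indices 0..3 and the counter is advanced to 19000 + 4 = 19004,
// so the next PyProcess starts from a fresh port block.
int port = 19000;                  // counter.getAndIncrement() for this worker
int devices = 4;                   // ((MultiDevice) device).getDevices().size()
int nextPortBase = port + devices; // value written back via counter.set(...)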