From 3351caa5a63a79bb2118413fd7c625f936e8f185 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 28 Jun 2024 11:58:36 -0700 Subject: [PATCH 1/8] init --- sky/skylet/providers/azure/node_provider.py | 50 +++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 068930eb390..de7586bd027 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -328,6 +328,56 @@ def _create_node(self, node_config, tags, count): parameters=parameters, ).wait() + # Configure driver extension for A10 GPUs + # TODO(tian): Only do this for A10 vms. + logger.info("Begin to configure A10 driver extension for VM: %s", vm_name) + self._configure_a10_driver_extension(vm_name) + logger.info("A10 driver extension configured for VM: %s", vm_name) + + def _configure_a10_driver_extension(self, vm_name): + resource_group = self.provider_config["resource_group"] + parameters = { + "properties": { + "mode": DeploymentMode.incremental, + "template": { + "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "vmName": { + "type": "string", + "metadata": { + "description": "Name of the virtual machine" + } + } + }, + "resources": [ + { + "type": "Microsoft.Compute/virtualMachines/extensions", + "apiVersion": "2015-06-15", + "location": "[resourceGroup().location]", + "name": "[concat(parameters('vmName'),'/NvidiaGpuDriverLinux')]", + "properties": { + "publisher": "Microsoft.HpcCompute", + "type": "NvidiaGpuDriverLinux", + "typeHandlerVersion": "1.9", + "autoUpgradeMinorVersion": True, + "settings": {}, + }, + } + ], + }, + "parameters": {"vmName": {"value": vm_name}}, + } + } + create_or_update = get_azure_sdk_function( + client=self.resource_client.deployments, function_name="create_or_update" + ) + create_or_update( + resource_group_name=resource_group, + deployment_name=vm_name, + parameters=parameters, + ).wait() + @synchronized def set_node_tags(self, node_id, tags): """Sets the tag values (string dict) for the specified node.""" From 74b669ce7e378d702c11f52f147196a6e839077f Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 28 Jun 2024 19:02:04 -0700 Subject: [PATCH 2/8] works. todo: only do this for A10 VMs --- sky/skylet/providers/azure/node_provider.py | 26 ++++++++++++++------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index de7586bd027..9ec354d2344 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -322,17 +322,29 @@ def _create_node(self, node_config, tags, count): create_or_update = get_azure_sdk_function( client=self.resource_client.deployments, function_name="create_or_update" ) - create_or_update( + poller = create_or_update( resource_group_name=resource_group, deployment_name=vm_name, parameters=parameters, - ).wait() + ) + poller.wait() # Configure driver extension for A10 GPUs # TODO(tian): Only do this for A10 vms. - logger.info("Begin to configure A10 driver extension for VM: %s", vm_name) - self._configure_a10_driver_extension(vm_name) - logger.info("A10 driver extension configured for VM: %s", vm_name) + create_result = poller.result().as_dict() + output_resources = create_result.get("properties", {}).get( + "output_resources", [] + ) + vms_to_add_driver = [] + for r in output_resources: + r_id = r.get("id", "") + if "Microsoft.Compute/virtualMachines" in r_id: + vms_to_add_driver.append(r_id.split("/")[-1]) + + for v in vms_to_add_driver: + logger.info(f"Begin to configure A10 driver extension for VM: {v}") + self._configure_a10_driver_extension(v) + logger.info(f"A10 driver extension configured for VM: {v}") def _configure_a10_driver_extension(self, vm_name): resource_group = self.provider_config["resource_group"] @@ -345,9 +357,7 @@ def _configure_a10_driver_extension(self, vm_name): "parameters": { "vmName": { "type": "string", - "metadata": { - "description": "Name of the virtual machine" - } + "metadata": {"description": "Name of the virtual machine"}, } }, "resources": [ From d4a2f413a725ca8aba9d69e01444fdaa87c24035 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Fri, 28 Jun 2024 19:27:52 -0700 Subject: [PATCH 3/8] only install for A10 instances --- sky/skylet/providers/azure/node_provider.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 9ec354d2344..76872839a82 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -329,8 +329,15 @@ def _create_node(self, node_config, tags, count): ) poller.wait() + # pylint: disable=import-outside-toplevel + from sky.clouds.service_catalog import azure_catalog + + instance_type = node_config["azure_arm_parameters"].get("vmSize", "") + accs = azure_catalog.get_accelerators_from_instance_type(instance_type) + if accs is None or "A10" not in accs: + return + # Configure driver extension for A10 GPUs - # TODO(tian): Only do this for A10 vms. create_result = poller.result().as_dict() output_resources = create_result.get("properties", {}).get( "output_resources", [] From 823ed8d04367145bb83816119c2ccab1a3ac727c Mon Sep 17 00:00:00 2001 From: cblmemo Date: Mon, 1 Jul 2024 20:46:16 -0700 Subject: [PATCH 4/8] merge into one template --- sky/skylet/providers/azure/node_provider.py | 86 ++++++--------------- 1 file changed, 25 insertions(+), 61 deletions(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 76872839a82..c7653144698 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -308,71 +308,27 @@ def _create_node(self, node_config, tags, count): template_params["nsg"] = self.provider_config["nsg"] template_params["subnet"] = self.provider_config["subnet"] - parameters = { - "properties": { - "mode": DeploymentMode.incremental, - "template": template, - "parameters": { - key: {"value": value} for key, value in template_params.items() - }, - } - } - - # TODO: we could get the private/public ips back directly - create_or_update = get_azure_sdk_function( - client=self.resource_client.deployments, function_name="create_or_update" - ) - poller = create_or_update( - resource_group_name=resource_group, - deployment_name=vm_name, - parameters=parameters, - ) - poller.wait() - # pylint: disable=import-outside-toplevel from sky.clouds.service_catalog import azure_catalog instance_type = node_config["azure_arm_parameters"].get("vmSize", "") accs = azure_catalog.get_accelerators_from_instance_type(instance_type) - if accs is None or "A10" not in accs: - return - - # Configure driver extension for A10 GPUs - create_result = poller.result().as_dict() - output_resources = create_result.get("properties", {}).get( - "output_resources", [] - ) - vms_to_add_driver = [] - for r in output_resources: - r_id = r.get("id", "") - if "Microsoft.Compute/virtualMachines" in r_id: - vms_to_add_driver.append(r_id.split("/")[-1]) - - for v in vms_to_add_driver: - logger.info(f"Begin to configure A10 driver extension for VM: {v}") - self._configure_a10_driver_extension(v) - logger.info(f"A10 driver extension configured for VM: {v}") - - def _configure_a10_driver_extension(self, vm_name): - resource_group = self.provider_config["resource_group"] - parameters = { - "properties": { - "mode": DeploymentMode.incremental, - "template": { - "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", - "contentVersion": "1.0.0.0", - "parameters": { - "vmName": { - "type": "string", - "metadata": {"description": "Name of the virtual machine"}, - } - }, - "resources": [ + if accs is not None and "A10" in accs: + # Configure driver extension for A10 GPUs. A10 GPUs requires a + # special type of drivers which is available at Microsoft HPC + # extension. Reference: https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2 + for r in template["resources"]: + if r["type"] == "Microsoft.Compute/virtualMachines": + # Add a nested extension resource for A10 GPUs + r["resources"] = [ { - "type": "Microsoft.Compute/virtualMachines/extensions", + "type": "extensions", "apiVersion": "2015-06-15", - "location": "[resourceGroup().location]", - "name": "[concat(parameters('vmName'),'/NvidiaGpuDriverLinux')]", + "location": "[variables('location')]", + "dependsOn": [ + "[concat('Microsoft.Compute/virtualMachines/', parameters('vmName'), copyIndex())]" + ], + "name": "NvidiaGpuDriverLinux", "properties": { "publisher": "Microsoft.HpcCompute", "type": "NvidiaGpuDriverLinux", @@ -380,12 +336,20 @@ def _configure_a10_driver_extension(self, vm_name): "autoUpgradeMinorVersion": True, "settings": {}, }, - } - ], + }, + ] + + parameters = { + "properties": { + "mode": DeploymentMode.incremental, + "template": template, + "parameters": { + key: {"value": value} for key, value in template_params.items() }, - "parameters": {"vmName": {"value": vm_name}}, } } + + # TODO: we could get the private/public ips back directly create_or_update = get_azure_sdk_function( client=self.resource_client.deployments, function_name="create_or_update" ) From 4a4789aa631f81e299151e0f19d193820c5dd6db Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Wed, 3 Jul 2024 00:36:28 +0800 Subject: [PATCH 5/8] Update sky/skylet/providers/azure/node_provider.py Co-authored-by: Zhanghao Wu --- sky/skylet/providers/azure/node_provider.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index a260a304419..03b2b3beb6b 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -333,6 +333,7 @@ def _create_node(self, node_config, tags, count): }, }, ] + break parameters = { "properties": { From 4ccedce2051d80e7e5024f609a4fe8883bac9966 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 3 Jul 2024 09:15:54 -0700 Subject: [PATCH 6/8] add warning --- sky/backends/cloud_vm_ray_backend.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 89f9dcdc695..6fe4211f102 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -2020,8 +2020,16 @@ def provision_with_retries( failover_history: List[Exception] = list() style = colorama.Style + fore = colorama.Fore # Retrying launchable resources. while True: + if (isinstance(to_provision.cloud, clouds.Azure) and + to_provision.accelerators is not None and + 'A10' in to_provision.accelerators): + logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch ' + 'an A10 cluster on Azure. This may take ~20 ' + 'minutes due to driver installation.' + f'{style.RESET_ALL}') try: # Recheck cluster name as the 'except:' block below may # change the cloud assignment. From 018181f5c1c391c0f2ea32023c8e70051aa4a004 Mon Sep 17 00:00:00 2001 From: cblmemo Date: Wed, 3 Jul 2024 17:01:27 -0700 Subject: [PATCH 7/8] apply suggestions from code review --- sky/clouds/azure.py | 20 ++++++++++++-------- sky/skylet/providers/azure/node_provider.py | 7 +------ sky/templates/azure-ray.yml.j2 | 2 ++ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index b75f9207856..19fca673977 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -7,7 +7,7 @@ import subprocess import textwrap import typing -from typing import Dict, Iterator, List, Optional, Tuple +from typing import Any, Dict, Iterator, List, Optional, Tuple import colorama @@ -269,13 +269,12 @@ def get_vcpus_mem_from_instance_type( def get_zone_shell_cmd(cls) -> Optional[str]: return None - def make_deploy_resources_variables( - self, - resources: 'resources.Resources', - cluster_name_on_cloud: str, - region: 'clouds.Region', - zones: Optional[List['clouds.Zone']], - dryrun: bool = False) -> Dict[str, Optional[str]]: + def make_deploy_resources_variables(self, + resources: 'resources.Resources', + cluster_name_on_cloud: str, + region: 'clouds.Region', + zones: Optional[List['clouds.Zone']], + dryrun: bool = False) -> Dict[str, Any]: assert zones is None, ('Azure does not support zones', zones) region_name = region.name @@ -315,6 +314,10 @@ def make_deploy_resources_variables( 'image_version': version, } + # Setup the A10 nvidia driver. + need_nvidia_driver_extension = (resources.accelerators is not None and + 'A10' in resources.accelerators) + # Setup commands to eliminate the banner and restart sshd. # This script will modify /etc/ssh/sshd_config and add a bash script # into .bashrc. The bash script will restart sshd if it has not been @@ -367,6 +370,7 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: # Azure does not support specific zones. 'zones': None, **image_config, + 'need_nvidia_driver_extension': need_nvidia_driver_extension, 'disk_tier': Azure._get_disk_type(_failover_disk_tier()), 'cloud_init_setup_commands': cloud_init_setup_commands, 'azure_subscription_id': self.get_project_id(dryrun), diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 03b2b3beb6b..5f87e57245e 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -303,12 +303,7 @@ def _create_node(self, node_config, tags, count): template_params["nsg"] = self.provider_config["nsg"] template_params["subnet"] = self.provider_config["subnet"] - # pylint: disable=import-outside-toplevel - from sky.clouds.service_catalog import azure_catalog - - instance_type = node_config["azure_arm_parameters"].get("vmSize", "") - accs = azure_catalog.get_accelerators_from_instance_type(instance_type) - if accs is not None and "A10" in accs: + if node_config.get("need_nvidia_driver_extension", False): # Configure driver extension for A10 GPUs. A10 GPUs requires a # special type of drivers which is available at Microsoft HPC # extension. Reference: https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2 diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 66eac439453..e8c388e1879 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -80,6 +80,7 @@ available_node_types: # billingProfile: # maxPrice: -1 {%- endif %} + need_nvidia_driver_extension: {{need_nvidia_driver_extension}} # TODO: attach disk {% if num_nodes > 1 %} ray.worker.default: @@ -108,6 +109,7 @@ available_node_types: # billingProfile: # maxPrice: -1 {%- endif %} + need_nvidia_driver_extension: {{need_nvidia_driver_extension}} {%- endif %} head_node_type: ray.head.default From f7b9935182111de4672219b708b907707e75e213 Mon Sep 17 00:00:00 2001 From: Tian Xia Date: Fri, 5 Jul 2024 08:51:14 +0800 Subject: [PATCH 8/8] Update sky/clouds/azure.py Co-authored-by: Zhanghao Wu --- sky/clouds/azure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 19fca673977..916a1c01c7d 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -315,8 +315,8 @@ def make_deploy_resources_variables(self, } # Setup the A10 nvidia driver. - need_nvidia_driver_extension = (resources.accelerators is not None and - 'A10' in resources.accelerators) + need_nvidia_driver_extension = (acc_dict is not None and + 'A10' in acc_dict) # Setup commands to eliminate the banner and restart sshd. # This script will modify /etc/ssh/sshd_config and add a bash script