From 3c3ce613f468ad0a391574e5532f78ec03a678e7 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:51:25 -0400 Subject: [PATCH 01/20] feat: allow security group specification --- sky/clouds/aws.py | 18 ++++++++++++------ sky/resources.py | 6 ------ sky/utils/schemas.py | 20 ++++++++++++++++++-- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index abfb82c0596..8d140ab6711 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -399,15 +399,21 @@ def make_deploy_resources_variables(self, user_security_group = skypilot_config.get_nested( ('aws', 'security_group_name'), None) - if resources.ports is not None: + if user_security_group is not None and not isinstance( + user_security_group, str): + for sg_name in user_security_group: + if cluster_name_on_cloud.startswith( + sg_name) and sg_name != 'default': + user_security_group = user_security_group[sg_name] + break + elif sg_name == 'default': + user_security_group = user_security_group[sg_name] + security_group = user_security_group + if user_security_group is None and resources.ports is not None: # Already checked in Resources._try_validate_ports - assert user_security_group is None security_group = USER_PORTS_SECURITY_GROUP_NAME.format( cluster_name_on_cloud) - elif user_security_group is not None: - assert resources.ports is None - security_group = user_security_group - else: + elif user_security_group is None: security_group = DEFAULT_SECURITY_GROUP_NAME return { diff --git a/sky/resources.py b/sky/resources.py index 38f7a9784e6..adbcf745f89 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -929,12 +929,6 @@ def _try_validate_ports(self) -> None: """ if self.ports is None: return - if skypilot_config.get_nested(('aws', 'security_group_name'), - None) is not None: - with ux_utils.print_exception_no_traceback(): - raise ValueError( - 'Cannot specify ports when AWS security group name is ' - 'specified.') if self.cloud is not None: self.cloud.check_features_are_supported( self, {clouds.CloudImplementationFeatures.OPEN_PORTS}) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index a529d61f2f6..7fe4689a6bc 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -114,6 +114,8 @@ def _get_single_resources_schema(): 'type': 'integer', }] } + }, { + 'type': 'null', }], }, 'labels': { @@ -683,7 +685,7 @@ def get_config_schema(): # Validation may fail if $schema is included. if k != '$schema' } - resources_schema['properties'].pop('ports') + resources_schema['properties'].pop('port', None) controller_resources_schema = { 'type': 'object', 'required': [], @@ -706,7 +708,21 @@ def get_config_schema(): 'additionalProperties': False, 'properties': { 'security_group_name': { - 'type': 'string' + 'oneOf': [{ + 'type': 'string' + }, { + 'type': 'object', + 'additionalProperties': False, + 'required': ['default'], + 'properties': { + 'sky-serve-controller': { + 'type': 'string', + }, + 'default': { + 'type': 'string' + } + } + }] }, **_LABELS_SCHEMA, **_NETWORK_CONFIG_SCHEMA, From 6d89ce99c547763643c439c7beaa4fa8c392c7a2 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:51:25 -0400 Subject: [PATCH 02/20] feat: refactor format of sg names --- sky/clouds/aws.py | 13 +++++-------- sky/utils/schemas.py | 33 ++++++++++++++++++++------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 8d140ab6711..554155964d6 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -1,5 +1,6 @@ """Amazon Web Services.""" import enum +import fnmatch import functools import json import os @@ -399,15 +400,11 @@ def make_deploy_resources_variables(self, user_security_group = skypilot_config.get_nested( ('aws', 'security_group_name'), None) - if user_security_group is not None and not isinstance( - user_security_group, str): - for sg_name in user_security_group: - if cluster_name_on_cloud.startswith( - sg_name) and sg_name != 'default': - user_security_group = user_security_group[sg_name] + if user_security_group is not None and not isinstance(user_security_group, str): + for profile in user_security_group: + if fnmatch.fnmatchcase(cluster_name_on_cloud, list(profile.keys())[0]): + user_security_group = list(profile.values())[0] break - elif sg_name == 'default': - user_security_group = user_security_group[sg_name] security_group = user_security_group if user_security_group is None and resources.ports is not None: # Already checked in Resources._try_validate_ports diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 7fe4689a6bc..49f5d7201e8 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -708,21 +708,28 @@ def get_config_schema(): 'additionalProperties': False, 'properties': { 'security_group_name': { - 'oneOf': [{ - 'type': 'string' - }, { - 'type': 'object', - 'additionalProperties': False, - 'required': ['default'], - 'properties': { - 'sky-serve-controller': { - 'type': 'string', + 'oneOf': [ + { + 'type': 'string' + }, + { + # A list of single-element dict to pretain the order. + # Example: + # security_group_name: + # - my-cluster1-*: my-security-group-1 + # - my-cluster2-*: my-security-group-2 + # - "*"": my-security-group-3 + 'type': 'array', + 'items': { + 'type': 'object', + 'additionalProperties': { + 'type': 'string' + }, + 'maxProperties': 1, + 'minProperties': 1, }, - 'default': { - 'type': 'string' - } } - }] + ] }, **_LABELS_SCHEMA, **_NETWORK_CONFIG_SCHEMA, From c4d500f71f1aa642fb6a3d8073e05e8267e74bba Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:51:25 -0400 Subject: [PATCH 03/20] refactor: update readme for security group name update --- docs/source/reference/config.rst | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index cbcc85a4f50..bd64c7c051b 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -158,11 +158,27 @@ Available fields and semantics: # Security group (optional). # - # The name of the security group to use for all instances. If not specified, + # Security group name to use for AWS instances. If not specified, # SkyPilot will use the default name for the security group: sky-sg- # Note: please ensure the security group name specified exists in the # regions the instances are going to be launched or the AWS account has the # permission to create a security group. + # + # Some example use cases are shown below. All fields are optional. + # - : apply the service account with the specified name to all instances. + # Example: + # security_group_name: my-security-group + # - : A list of single-element dict mapping from the cluster name (pattern) + # to the security group name to use. The matching of the cluster name is done in the same order + # as the list. + # NOTE: If none of the wildcard expressions in the dict match the cluster name, SkyPilot will use the default + # security group name as mentioned above: sky-sg- + # To specify your default, use "*" as the wildcard expression. + # Example: + # security_group_name: + # - my-cluster-name: my-security-group-1 + # - sky-serve-controller-*: my-security-group-2 + # - "*": my-default-security-group security_group_name: my-security-group # Identity to use for AWS instances (optional). From 2be0d530659f3ee40f617babeab2c5c701a59bc9 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:51:25 -0400 Subject: [PATCH 04/20] fix: format --- sky/clouds/aws.py | 6 ++++-- sky/utils/schemas.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 554155964d6..6998527073b 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -400,9 +400,11 @@ def make_deploy_resources_variables(self, user_security_group = skypilot_config.get_nested( ('aws', 'security_group_name'), None) - if user_security_group is not None and not isinstance(user_security_group, str): + if user_security_group is not None and not isinstance( + user_security_group, str): for profile in user_security_group: - if fnmatch.fnmatchcase(cluster_name_on_cloud, list(profile.keys())[0]): + if fnmatch.fnmatchcase(cluster_name_on_cloud, + list(profile.keys())[0]): user_security_group = list(profile.values())[0] break security_group = user_security_group diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 49f5d7201e8..519918fa3e6 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -713,7 +713,8 @@ def get_config_schema(): 'type': 'string' }, { - # A list of single-element dict to pretain the order. + # A list of single-element dict to pretain the + # order. # Example: # security_group_name: # - my-cluster1-*: my-security-group-1 From 54b6091c1a15d593b938e8620fc150074287dc2d Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 05/20] refactor: use ClusterName data class --- sky/backends/backend_utils.py | 11 +++- sky/backends/cloud_vm_ray_backend.py | 16 ++--- sky/cli.py | 2 +- sky/clouds/aws.py | 33 ++++++----- sky/clouds/azure.py | 15 ++--- sky/clouds/cloud.py | 6 +- sky/clouds/cudo.py | 4 +- sky/clouds/fluidstack.py | 5 +- sky/clouds/gcp.py | 25 ++++---- sky/clouds/ibm.py | 4 +- sky/clouds/kubernetes.py | 4 +- sky/clouds/lambda_cloud.py | 4 +- sky/clouds/oci.py | 4 +- sky/clouds/paperspace.py | 4 +- sky/clouds/runpod.py | 4 +- sky/clouds/scp.py | 4 +- sky/clouds/vsphere.py | 4 +- sky/provision/provisioner.py | 27 +++------ sky/resources.py | 18 +++++- sky/utils/resources_utils.py | 13 ++++ sky/utils/schemas.py | 88 +++++++++++----------------- 21 files changed, 154 insertions(+), 141 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index b80cf667413..854c1baced8 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -155,7 +155,11 @@ # we need to take this field from the new yaml. ('provider', 'tpu_node'), ('provider', 'security_group', 'GroupName'), + ('available_node_types', 'ray.head.default', 'node_config', + 'IamInstanceProfile'), ('available_node_types', 'ray.head.default', 'node_config', 'UserData'), + ('available_node_types', 'ray.worker.default', 'node_config', + 'IamInstanceProfile'), ('available_node_types', 'ray.worker.default', 'node_config', 'UserData'), ] @@ -793,8 +797,11 @@ def write_cluster_config( # move the check out of this function, i.e. the caller should be responsible # for the validation. # TODO(tian): Move more cloud agnostic vars to resources.py. - resources_vars = to_provision.make_deploy_variables(cluster_name_on_cloud, - region, zones, dryrun) + resources_vars = to_provision.make_deploy_variables( + resources_utils.ClusterName( + cluster_name, + cluster_name_on_cloud, + ), region, zones, dryrun) config_dict = {} specific_reservations = set( diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 6fe4211f102..63a198bbf45 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1566,8 +1566,8 @@ def _retry_zones( to_provision.cloud, region, zones, - provisioner.ClusterName(cluster_name, - handle.cluster_name_on_cloud), + resources_utils.ClusterName( + cluster_name, handle.cluster_name_on_cloud), num_nodes=num_nodes, cluster_yaml=handle.cluster_yaml, prev_cluster_ever_up=prev_cluster_ever_up, @@ -1577,8 +1577,10 @@ def _retry_zones( # caller. resources_vars = ( to_provision.cloud.make_deploy_resources_variables( - to_provision, handle.cluster_name_on_cloud, region, - zones)) + to_provision, + resources_utils.ClusterName( + cluster_name, handle.cluster_name_on_cloud), + region, zones)) config_dict['provision_record'] = provision_record config_dict['resources_vars'] = resources_vars config_dict['handle'] = handle @@ -2898,8 +2900,8 @@ def _provision( # 4. Starting ray cluster and skylet. cluster_info = provisioner.post_provision_runtime_setup( repr(handle.launched_resources.cloud), - provisioner.ClusterName(handle.cluster_name, - handle.cluster_name_on_cloud), + resources_utils.ClusterName(handle.cluster_name, + handle.cluster_name_on_cloud), handle.cluster_yaml, provision_record=provision_record, custom_resource=resources_vars.get('custom_resources'), @@ -3877,7 +3879,7 @@ def teardown_no_lock(self, try: provisioner.teardown_cluster(repr(cloud), - provisioner.ClusterName( + resources_utils.ClusterName( cluster_name, cluster_name_on_cloud), terminate=terminate, diff --git a/sky/cli.py b/sky/cli.py index db5291d949c..3717138f80b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3868,7 +3868,7 @@ def _generate_task_with_service( env: List[Tuple[str, str]], gpus: Optional[str], instance_type: Optional[str], - ports: Tuple[str], + ports: Optional[Tuple[str]], cpus: Optional[str], memory: Optional[str], disk_size: Optional[int], diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 6998527073b..f890091b070 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -371,12 +371,13 @@ def get_vcpus_mem_from_instance_type( return service_catalog.get_vcpus_mem_from_instance_type(instance_type, clouds='aws') - def make_deploy_resources_variables(self, - resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, - region: 'clouds.Region', - zones: Optional[List['clouds.Zone']], - dryrun: bool = False) -> Dict[str, Any]: + def make_deploy_resources_variables( + self, + resources: 'resources_lib.Resources', + cluster_name: resources_utils.ClusterName, + region: 'clouds.Region', + zones: Optional[List['clouds.Zone']], + dryrun: bool = False) -> Dict[str, Any]: del dryrun # unused assert zones is not None, (region, zones) @@ -403,7 +404,7 @@ def make_deploy_resources_variables(self, if user_security_group is not None and not isinstance( user_security_group, str): for profile in user_security_group: - if fnmatch.fnmatchcase(cluster_name_on_cloud, + if fnmatch.fnmatchcase(cluster_name.name_on_cloud, list(profile.keys())[0]): user_security_group = list(profile.values())[0] break @@ -411,7 +412,7 @@ def make_deploy_resources_variables(self, if user_security_group is None and resources.ports is not None: # Already checked in Resources._try_validate_ports security_group = USER_PORTS_SECURITY_GROUP_NAME.format( - cluster_name_on_cloud) + cluster_name.name_on_cloud) elif user_security_group is None: security_group = DEFAULT_SECURITY_GROUP_NAME @@ -845,22 +846,24 @@ def query_status(cls, name: str, tag_filters: Dict[str, str], assert False, 'This code path should not be used.' @classmethod - def create_image_from_cluster(cls, cluster_name: str, - cluster_name_on_cloud: str, + def create_image_from_cluster(cls, + cluster_name: resources_utils.ClusterName, region: Optional[str], zone: Optional[str]) -> str: - assert region is not None, (cluster_name, cluster_name_on_cloud, region) + assert region is not None, (cluster_name.display_name, + cluster_name.name_on_cloud, region) del zone # unused - image_name = f'skypilot-{cluster_name}-{int(time.time())}' + image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}' - status = provision_lib.query_instances('AWS', cluster_name_on_cloud, + status = provision_lib.query_instances('AWS', + cluster_name.name_on_cloud, {'region': region}) instance_ids = list(status.keys()) if not instance_ids: with ux_utils.print_exception_no_traceback(): raise RuntimeError( - f'Failed to find the source cluster {cluster_name!r} on ' + f'Failed to find the source cluster {cluster_name.display_name!r} on ' 'AWS.') if len(instance_ids) != 1: @@ -887,7 +890,7 @@ def create_image_from_cluster(cls, cluster_name: str, stream_logs=True) rich_utils.force_update_status( - f'Waiting for the source image {cluster_name!r} from {region} to be available on AWS.' + f'Waiting for the source image {cluster_name.display_name!r} from {region} to be available on AWS.' ) # Wait for the image to be available wait_image_cmd = ( diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index 916a1c01c7d..c2a3f3eb071 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -269,12 +269,13 @@ def get_vcpus_mem_from_instance_type( def get_zone_shell_cmd(cls) -> Optional[str]: return None - def make_deploy_resources_variables(self, - resources: 'resources.Resources', - cluster_name_on_cloud: str, - region: 'clouds.Region', - zones: Optional[List['clouds.Zone']], - dryrun: bool = False) -> Dict[str, Any]: + def make_deploy_resources_variables( + self, + resources: 'resources.Resources', + cluster_name: resources_utils.ClusterName, + region: 'clouds.Region', + zones: Optional[List['clouds.Zone']], + dryrun: bool = False) -> Dict[str, Any]: assert zones is None, ('Azure does not support zones', zones) region_name = region.name @@ -374,7 +375,7 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: 'disk_tier': Azure._get_disk_type(_failover_disk_tier()), 'cloud_init_setup_commands': cloud_init_setup_commands, 'azure_subscription_id': self.get_project_id(dryrun), - 'resource_group': f'{cluster_name_on_cloud}-{region_name}', + 'resource_group': f'{cluster_name.name_on_cloud}-{region_name}', } def _get_feasible_launchable_resources( diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index c5ff78e1c79..93048a84e74 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -253,7 +253,7 @@ def is_same_cloud(self, other: 'Cloud') -> bool: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'Region', zones: Optional[List['Zone']], dryrun: bool = False, @@ -726,8 +726,8 @@ def query_status(cls, name: str, tag_filters: Dict[str, str], # cloud._cloud_unsupported_features(). @classmethod - def create_image_from_cluster(cls, cluster_name: str, - cluster_name_on_cloud: str, + def create_image_from_cluster(cls, + cluster_name: resources_utils.ClusterName, region: Optional[str], zone: Optional[str]) -> str: """Creates an image from the cluster. diff --git a/sky/clouds/cudo.py b/sky/clouds/cudo.py index 3ad66306517..8f7d4eaf923 100644 --- a/sky/clouds/cudo.py +++ b/sky/clouds/cudo.py @@ -194,12 +194,12 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False, ) -> Dict[str, Optional[str]]: - del zones + del zones, cluster_name # unused r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) if acc_dict is not None: diff --git a/sky/clouds/fluidstack.py b/sky/clouds/fluidstack.py index d7921a3f51a..c4f15a0e510 100644 --- a/sky/clouds/fluidstack.py +++ b/sky/clouds/fluidstack.py @@ -10,6 +10,7 @@ from sky import status_lib from sky.clouds import service_catalog from sky.provision.fluidstack import fluidstack_utils +from sky.utils import resources_utils from sky.utils.resources_utils import DiskTier _CREDENTIAL_FILES = [ @@ -174,7 +175,7 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: clouds.Region, zones: Optional[List[clouds.Zone]], dryrun: bool = False, @@ -189,7 +190,7 @@ def make_deploy_resources_variables( else: custom_resources = None cuda_installation_commands = """ - sudo wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /usr/local/cuda-keyring_1.1-1_all.deb; + sudo wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /usr/local/cuda-keyring_1.1-1_all.deb; sudo dpkg -i /usr/local/cuda-keyring_1.1-1_all.deb; sudo apt-get update; sudo apt-get -y install cuda-toolkit-12-3; diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 86e9a90faf4..050fda07fe4 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -404,7 +404,7 @@ def get_default_instance_type( def make_deploy_resources_variables( self, resources: 'resources.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False) -> Dict[str, Optional[str]]: @@ -495,15 +495,15 @@ def make_deploy_resources_variables( firewall_rule = None if resources.ports is not None: - firewall_rule = ( - USER_PORTS_FIREWALL_RULE_NAME.format(cluster_name_on_cloud)) + firewall_rule = (USER_PORTS_FIREWALL_RULE_NAME.format( + cluster_name.name_on_cloud)) resources_vars['firewall_rule'] = firewall_rule # For TPU nodes. TPU VMs do not need TPU_NAME. tpu_node_name = resources_vars.get('tpu_node_name') if gcp_utils.is_tpu(resources) and not gcp_utils.is_tpu_vm(resources): if tpu_node_name is None: - tpu_node_name = cluster_name_on_cloud + tpu_node_name = cluster_name.name_on_cloud resources_vars['tpu_node_name'] = tpu_node_name @@ -1005,8 +1005,8 @@ def query_status(cls, name: str, tag_filters: Dict[str, str], assert False, 'This code path should not be used.' @classmethod - def create_image_from_cluster(cls, cluster_name: str, - cluster_name_on_cloud: str, + def create_image_from_cluster(cls, + cluster_name: resources_utils.ClusterName, region: Optional[str], zone: Optional[str]) -> str: del region # unused @@ -1015,7 +1015,7 @@ def create_image_from_cluster(cls, cluster_name: str, # `ray-cluster-name` tag, which is guaranteed by the current `ray` # backend. Once the `provision.query_instances` is implemented for GCP, # we should be able to get rid of this assumption. - tag_filters = {'ray-cluster-name': cluster_name_on_cloud} + tag_filters = {'ray-cluster-name': cluster_name.name_on_cloud} label_filter_str = cls._label_filter_str(tag_filters) instance_name_cmd = ('gcloud compute instances list ' f'--filter="({label_filter_str})" ' @@ -1027,7 +1027,8 @@ def create_image_from_cluster(cls, cluster_name: str, subprocess_utils.handle_returncode( returncode, instance_name_cmd, - error_msg=f'Failed to get instance name for {cluster_name!r}', + error_msg= + f'Failed to get instance name for {cluster_name.display_name!r}', stderr=stderr, stream_logs=True) instance_names = json.loads(stdout) @@ -1038,7 +1039,7 @@ def create_image_from_cluster(cls, cluster_name: str, f'instance, but got: {instance_names}') instance_name = instance_names[0]['name'] - image_name = f'skypilot-{cluster_name}-{int(time.time())}' + image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}' create_image_cmd = (f'gcloud compute images create {image_name} ' f'--source-disk {instance_name} ' f'--source-disk-zone {zone}') @@ -1050,7 +1051,8 @@ def create_image_from_cluster(cls, cluster_name: str, subprocess_utils.handle_returncode( returncode, create_image_cmd, - error_msg=f'Failed to create image for {cluster_name!r}', + error_msg= + f'Failed to create image for {cluster_name.display_name!r}', stderr=stderr, stream_logs=True) @@ -1064,7 +1066,8 @@ def create_image_from_cluster(cls, cluster_name: str, subprocess_utils.handle_returncode( returncode, image_uri_cmd, - error_msg=f'Failed to get image uri for {cluster_name!r}', + error_msg= + f'Failed to get image uri for {cluster_name.display_name!r}', stderr=stderr, stream_logs=True) diff --git a/sky/clouds/ibm.py b/sky/clouds/ibm.py index 86e325a437b..e468fecf00f 100644 --- a/sky/clouds/ibm.py +++ b/sky/clouds/ibm.py @@ -168,7 +168,7 @@ def get_egress_cost(self, num_gigabytes: float): def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False, @@ -184,7 +184,7 @@ def make_deploy_resources_variables( Returns: A dictionary of cloud-specific node type variables. """ - del cluster_name_on_cloud, dryrun # Unused. + del cluster_name, dryrun # Unused. def _get_profile_resources(instance_profile): """returns a dict representing the diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 78471e0de9f..113774142c9 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -224,11 +224,11 @@ def get_image_size(cls, image_id: str, region: Optional[str]) -> int: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: Optional['clouds.Region'], zones: Optional[List['clouds.Zone']], dryrun: bool = False) -> Dict[str, Optional[str]]: - del cluster_name_on_cloud, zones, dryrun # Unused. + del cluster_name, zones, dryrun # Unused. if region is None: region = self._regions[0] diff --git a/sky/clouds/lambda_cloud.py b/sky/clouds/lambda_cloud.py index 979b4833354..036f5a23979 100644 --- a/sky/clouds/lambda_cloud.py +++ b/sky/clouds/lambda_cloud.py @@ -156,11 +156,11 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False) -> Dict[str, Optional[str]]: - del cluster_name_on_cloud, dryrun # Unused. + del cluster_name, dryrun # Unused. assert zones is None, 'Lambda does not support zones.' r = resources diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index 5fb0111bf01..a911c3f38d0 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -187,11 +187,11 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: Optional['clouds.Region'], zones: Optional[List['clouds.Zone']], dryrun: bool = False) -> Dict[str, Optional[str]]: - del cluster_name_on_cloud, dryrun # Unused. + del cluster_name, dryrun # Unused. assert region is not None, resources acc_dict = self.get_accelerators_from_instance_type( diff --git a/sky/clouds/paperspace.py b/sky/clouds/paperspace.py index f67a9a27176..efa1afee781 100644 --- a/sky/clouds/paperspace.py +++ b/sky/clouds/paperspace.py @@ -173,11 +173,11 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False) -> Dict[str, Optional[str]]: - del zones, dryrun + del zones, dryrun, cluster_name r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) diff --git a/sky/clouds/runpod.py b/sky/clouds/runpod.py index c7a24e274dd..3486330b8b3 100644 --- a/sky/clouds/runpod.py +++ b/sky/clouds/runpod.py @@ -166,11 +166,11 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False) -> Dict[str, Optional[str]]: - del zones, dryrun # unused + del zones, dryrun, cluster_name # unused r = resources acc_dict = self.get_accelerators_from_instance_type(r.instance_type) diff --git a/sky/clouds/scp.py b/sky/clouds/scp.py index 6a3daf2712a..da45a7e143e 100644 --- a/sky/clouds/scp.py +++ b/sky/clouds/scp.py @@ -179,11 +179,11 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False) -> Dict[str, Optional[str]]: - del cluster_name_on_cloud, dryrun # Unused. + del cluster_name, dryrun # Unused. assert zones is None, 'SCP does not support zones.' r = resources diff --git a/sky/clouds/vsphere.py b/sky/clouds/vsphere.py index 872b8df9d70..968368ff0aa 100644 --- a/sky/clouds/vsphere.py +++ b/sky/clouds/vsphere.py @@ -171,13 +171,13 @@ def get_zone_shell_cmd(cls) -> Optional[str]: def make_deploy_resources_variables( self, resources: 'resources_lib.Resources', - cluster_name_on_cloud: str, + cluster_name: resources_utils.ClusterName, region: 'clouds.Region', zones: Optional[List['clouds.Zone']], dryrun: bool = False, ) -> Dict[str, Optional[str]]: # TODO get image id here. - del cluster_name_on_cloud, dryrun # unused + del cluster_name, dryrun # unused assert zones is not None, (region, zones) zone_names = [zone.name for zone in zones] r = resources diff --git a/sky/provision/provisioner.py b/sky/provision/provisioner.py index df9a9fcc58a..6e3886828e5 100644 --- a/sky/provision/provisioner.py +++ b/sky/provision/provisioner.py @@ -25,6 +25,7 @@ from sky.provision import metadata_utils from sky.skylet import constants from sky.utils import common_utils +from sky.utils import resources_utils from sky.utils import rich_utils from sky.utils import ux_utils @@ -38,23 +39,11 @@ _TITLE = '\n\n' + '=' * 20 + ' {} ' + '=' * 20 + '\n' -@dataclasses.dataclass -class ClusterName: - display_name: str - name_on_cloud: str - - def __repr__(self) -> str: - return repr(self.display_name) - - def __str__(self) -> str: - return self.display_name - - def _bulk_provision( cloud: clouds.Cloud, region: clouds.Region, zones: Optional[List[clouds.Zone]], - cluster_name: ClusterName, + cluster_name: resources_utils.ClusterName, bootstrap_config: provision_common.ProvisionConfig, ) -> provision_common.ProvisionRecord: provider_name = repr(cloud) @@ -135,7 +124,7 @@ def bulk_provision( cloud: clouds.Cloud, region: clouds.Region, zones: Optional[List[clouds.Zone]], - cluster_name: ClusterName, + cluster_name: resources_utils.ClusterName, num_nodes: int, cluster_yaml: str, prev_cluster_ever_up: bool, @@ -225,7 +214,7 @@ def bulk_provision( raise -def teardown_cluster(cloud_name: str, cluster_name: ClusterName, +def teardown_cluster(cloud_name: str, cluster_name: resources_utils.ClusterName, terminate: bool, provider_config: Dict) -> None: """Deleting or stopping a cluster. @@ -411,8 +400,8 @@ def wait_for_ssh(cluster_info: provision_common.ClusterInfo, def _post_provision_setup( - cloud_name: str, cluster_name: ClusterName, cluster_yaml: str, - provision_record: provision_common.ProvisionRecord, + cloud_name: str, cluster_name: resources_utils.ClusterName, + cluster_yaml: str, provision_record: provision_common.ProvisionRecord, custom_resource: Optional[str]) -> provision_common.ClusterInfo: config_from_yaml = common_utils.read_yaml(cluster_yaml) provider_config = config_from_yaml.get('provider') @@ -563,8 +552,8 @@ def _post_provision_setup( def post_provision_runtime_setup( - cloud_name: str, cluster_name: ClusterName, cluster_yaml: str, - provision_record: provision_common.ProvisionRecord, + cloud_name: str, cluster_name: resources_utils.ClusterName, + cluster_yaml: str, provision_record: provision_common.ProvisionRecord, custom_resource: Optional[str], log_dir: str) -> provision_common.ClusterInfo: """Run internal setup commands after provisioning and before user setup. diff --git a/sky/resources.py b/sky/resources.py index adbcf745f89..35c916a73c6 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -929,6 +929,20 @@ def _try_validate_ports(self) -> None: """ if self.ports is None: return + if self.cloud is None or isinstance(self.cloud, clouds.AWS): + security_group_name = skypilot_config.get_nested( + ('aws', 'security_group_name'), None) + if security_group_name is not None: + with ux_utils.print_exception_no_traceback(): + logger.warning( + f'Ports {self.ports} and security group name are ' + f'specified: {security_group_name}. It is not ' + 'guaranteed that the ports will be opened if the ' + 'specified security group is not correctly set up. ' + 'Please try to specify `ports` only and leave out ' + '`aws.security_group_name` in `~/.sky/config.yaml` to ' + 'allow SkyPilot to automatically create and configure ' + 'the security group.') if self.cloud is not None: self.cloud.check_features_are_supported( self, {clouds.CloudImplementationFeatures.OPEN_PORTS}) @@ -1003,7 +1017,7 @@ def get_accelerators_str(self) -> str: def get_spot_str(self) -> str: return '[Spot]' if self.use_spot else '' - def make_deploy_variables(self, cluster_name_on_cloud: str, + def make_deploy_variables(self, cluster_name: resources_utils.ClusterName, region: clouds.Region, zones: Optional[List[clouds.Zone]], dryrun: bool) -> Dict[str, Optional[str]]: @@ -1041,7 +1055,7 @@ def make_deploy_variables(self, cluster_name_on_cloud: str, # Cloud specific variables cloud_specific_variables = self.cloud.make_deploy_resources_variables( - self, cluster_name_on_cloud, region, zones, dryrun) + self, cluster_name, region, zones, dryrun) return dict( cloud_specific_variables, **{ diff --git a/sky/utils/resources_utils.py b/sky/utils/resources_utils.py index e2357b9eeb7..87a62dab95b 100644 --- a/sky/utils/resources_utils.py +++ b/sky/utils/resources_utils.py @@ -1,4 +1,5 @@ """Utility functions for resources.""" +import dataclasses import enum import itertools import re @@ -43,6 +44,18 @@ def __le__(self, other: 'DiskTier') -> bool: return types.index(self) <= types.index(other) +@dataclasses.dataclass +class ClusterName: + display_name: str + name_on_cloud: str + + def __repr__(self) -> str: + return repr(self.display_name) + + def __str__(self) -> str: + return self.display_name + + def check_port_str(port: str) -> None: if not port.isdigit(): with ux_utils.print_exception_no_traceback(): diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 519918fa3e6..9f29797c4f9 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -33,6 +33,36 @@ def _check_not_both_fields_present(field1: str, field2: str): } +def _get_cloud_name_property_mapping(field: str): + return { + field: { + 'oneOf': [ + { + 'type': 'string' + }, + { + # A list of single-element dict to pretain the + # order. + # Example: + # property_name: + # - my-cluster1-*: my-property-1 + # - my-cluster2-*: my-property-2 + # - "*"": my-property-3 + 'type': 'array', + 'items': { + 'type': 'object', + 'additionalProperties': { + 'type': 'string' + }, + 'maxProperties': 1, + 'minProperties': 1, + }, + } + ] + } + } + + def _get_single_resources_schema(): """Schema for a single resource in a resources list.""" # To avoid circular imports, only import when needed. @@ -640,33 +670,6 @@ def get_default_remote_identity(cloud: str) -> str: } } -_REMOTE_IDENTITY_SCHEMA_AWS = { - 'remote_identity': { - 'oneOf': [ - { - 'type': 'string' - }, - { - # A list of single-element dict to pretain the order. - # Example: - # remote_identity: - # - my-cluster1-*: my-iam-role-1 - # - my-cluster2-*: my-iam-role-2 - # - "*"": my-iam-role-3 - 'type': 'array', - 'items': { - 'type': 'object', - 'additionalProperties': { - 'type': 'string' - }, - 'maxProperties': 1, - 'minProperties': 1, - }, - } - ] - } -} - _REMOTE_IDENTITY_SCHEMA_KUBERNETES = { 'remote_identity': { 'type': 'string' @@ -685,7 +688,7 @@ def get_config_schema(): # Validation may fail if $schema is included. if k != '$schema' } - resources_schema['properties'].pop('port', None) + resources_schema['properties'].pop('ports') controller_resources_schema = { 'type': 'object', 'required': [], @@ -707,31 +710,7 @@ def get_config_schema(): 'required': [], 'additionalProperties': False, 'properties': { - 'security_group_name': { - 'oneOf': [ - { - 'type': 'string' - }, - { - # A list of single-element dict to pretain the - # order. - # Example: - # security_group_name: - # - my-cluster1-*: my-security-group-1 - # - my-cluster2-*: my-security-group-2 - # - "*"": my-security-group-3 - 'type': 'array', - 'items': { - 'type': 'object', - 'additionalProperties': { - 'type': 'string' - }, - 'maxProperties': 1, - 'minProperties': 1, - }, - } - ] - }, + **_get_cloud_name_property_mapping('security_group_name'), **_LABELS_SCHEMA, **_NETWORK_CONFIG_SCHEMA, }, @@ -890,7 +869,8 @@ def get_config_schema(): for cloud, config in cloud_configs.items(): if cloud == 'aws': - config['properties'].update(_REMOTE_IDENTITY_SCHEMA_AWS) + config['properties'].update( + _get_cloud_name_property_mapping('remote_identity')) elif cloud == 'kubernetes': config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES) else: From 05822264d81d60c7fe52256af17bcb22822e5d33 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 06/20] fix: move warning --- sky/clouds/aws.py | 11 +++++++++++ sky/resources.py | 14 -------------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index f890091b070..addbc3f3aa5 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -415,6 +415,17 @@ def make_deploy_resources_variables( cluster_name.name_on_cloud) elif user_security_group is None: security_group = DEFAULT_SECURITY_GROUP_NAME + elif user_security_group is not None and resources.ports is not None: + with ux_utils.print_exception_no_traceback(): + logger.warning( + f'Ports {resources.ports} and security group name are ' + f'specified: {user_security_group}. It is not ' + 'guaranteed that the ports will be opened if the ' + 'specified security group is not correctly set up. ' + 'Please try to specify `ports` only and leave out ' + '`aws.security_group_name` in `~/.sky/config.yaml` to ' + 'allow SkyPilot to automatically create and configure ' + 'the security group.') return { 'instance_type': r.instance_type, diff --git a/sky/resources.py b/sky/resources.py index 35c916a73c6..f0cb1abda1e 100644 --- a/sky/resources.py +++ b/sky/resources.py @@ -929,20 +929,6 @@ def _try_validate_ports(self) -> None: """ if self.ports is None: return - if self.cloud is None or isinstance(self.cloud, clouds.AWS): - security_group_name = skypilot_config.get_nested( - ('aws', 'security_group_name'), None) - if security_group_name is not None: - with ux_utils.print_exception_no_traceback(): - logger.warning( - f'Ports {self.ports} and security group name are ' - f'specified: {security_group_name}. It is not ' - 'guaranteed that the ports will be opened if the ' - 'specified security group is not correctly set up. ' - 'Please try to specify `ports` only and leave out ' - '`aws.security_group_name` in `~/.sky/config.yaml` to ' - 'allow SkyPilot to automatically create and configure ' - 'the security group.') if self.cloud is not None: self.cloud.check_features_are_supported( self, {clouds.CloudImplementationFeatures.OPEN_PORTS}) From 9e9e5a66b7954e7b043cd3411e33684fca0156ed Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 07/20] fix: clean code --- sky/clouds/aws.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index addbc3f3aa5..8db561946ea 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -409,12 +409,12 @@ def make_deploy_resources_variables( user_security_group = list(profile.values())[0] break security_group = user_security_group - if user_security_group is None and resources.ports is not None: - # Already checked in Resources._try_validate_ports - security_group = USER_PORTS_SECURITY_GROUP_NAME.format( - cluster_name.name_on_cloud) - elif user_security_group is None: + if security_group is None: security_group = DEFAULT_SECURITY_GROUP_NAME + if resources.ports is not None: + # Already checked in Resources._try_validate_ports + security_group = USER_PORTS_SECURITY_GROUP_NAME.format( + cluster_name.name_on_cloud) elif user_security_group is not None and resources.ports is not None: with ux_utils.print_exception_no_traceback(): logger.warning( From a990eec580b41f909f72fa7a019cd6acc15a1dca Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 08/20] fix: clean code --- sky/clouds/aws.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 8db561946ea..967214998aa 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -415,11 +415,11 @@ def make_deploy_resources_variables( # Already checked in Resources._try_validate_ports security_group = USER_PORTS_SECURITY_GROUP_NAME.format( cluster_name.name_on_cloud) - elif user_security_group is not None and resources.ports is not None: + elif security_group is not None and resources.ports is not None: with ux_utils.print_exception_no_traceback(): logger.warning( f'Ports {resources.ports} and security group name are ' - f'specified: {user_security_group}. It is not ' + f'specified: {security_group}. It is not ' 'guaranteed that the ports will be opened if the ' 'specified security group is not correctly set up. ' 'Please try to specify `ports` only and leave out ' From db2b69a168783efe0593e7a1816bb569d68159a8 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 09/20] fix: schema constant --- sky/utils/schemas.py | 63 +++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 9f29797c4f9..71829ea1de5 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -33,36 +33,6 @@ def _check_not_both_fields_present(field1: str, field2: str): } -def _get_cloud_name_property_mapping(field: str): - return { - field: { - 'oneOf': [ - { - 'type': 'string' - }, - { - # A list of single-element dict to pretain the - # order. - # Example: - # property_name: - # - my-cluster1-*: my-property-1 - # - my-cluster2-*: my-property-2 - # - "*"": my-property-3 - 'type': 'array', - 'items': { - 'type': 'object', - 'additionalProperties': { - 'type': 'string' - }, - 'maxProperties': 1, - 'minProperties': 1, - }, - } - ] - } - } - - def _get_single_resources_schema(): """Schema for a single resource in a resources list.""" # To avoid circular imports, only import when needed. @@ -642,6 +612,32 @@ def get_cluster_schema(): } } +_PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY = { + 'oneOf': [ + { + 'type': 'string' + }, + { + # A list of single-element dict to pretain the + # order. + # Example: + # property_name: + # - my-cluster1-*: my-property-1 + # - my-cluster2-*: my-property-2 + # - "*"": my-property-3 + 'type': 'array', + 'items': { + 'type': 'object', + 'additionalProperties': { + 'type': 'string' + }, + 'maxProperties': 1, + 'minProperties': 1, + }, + } + ] +} + class RemoteIdentityOptions(enum.Enum): """Enum for remote identity types. @@ -710,7 +706,7 @@ def get_config_schema(): 'required': [], 'additionalProperties': False, 'properties': { - **_get_cloud_name_property_mapping('security_group_name'), + 'security_group_name': _PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY, **_LABELS_SCHEMA, **_NETWORK_CONFIG_SCHEMA, }, @@ -869,8 +865,9 @@ def get_config_schema(): for cloud, config in cloud_configs.items(): if cloud == 'aws': - config['properties'].update( - _get_cloud_name_property_mapping('remote_identity')) + config['properties'].update({ + 'remote_identity': _PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY + }) elif cloud == 'kubernetes': config['properties'].update(_REMOTE_IDENTITY_SCHEMA_KUBERNETES) else: From 1da54289374cb6eb7f5e85a79027ed5dc06aa236 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 10/20] refactor: add sg test --- tests/test_yamls/test_aws_config.yaml | 11 ++++ tests/unit_tests/test_resources.py | 82 +++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 tests/test_yamls/test_aws_config.yaml diff --git a/tests/test_yamls/test_aws_config.yaml b/tests/test_yamls/test_aws_config.yaml new file mode 100644 index 00000000000..27ac4d4f555 --- /dev/null +++ b/tests/test_yamls/test_aws_config.yaml @@ -0,0 +1,11 @@ +aws: + vpc_name: fake-vpc + remote_identity: + - sky-serve-fake1-*: fake1-skypilot-role + - sky-serve-fake2-*: fake1-skypilot-role + - "*": fake-skypilot-default-role + + security_group_name: + - sky-serve-fake1-*: fake-1-sg + - sky-serve-fake2-*: fake-2-sg + - "*": fake-skypilot-default diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index f9e3ad51630..097c69d6400 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -1,10 +1,13 @@ from typing import Dict from unittest.mock import Mock +from unittest.mock import patch import pytest from sky import clouds +from sky import skypilot_config from sky.resources import Resources +from sky.utils import resources_utils GLOBAL_VALID_LABELS = { 'plaintext': 'plainvalue', @@ -86,3 +89,82 @@ def test_kubernetes_labels_resources(): } cloud = clouds.Kubernetes() _run_label_test(allowed_labels, invalid_labels, cloud) + + +@patch.object(skypilot_config, 'CONFIG_PATH', + './tests/test_yamls/test_aws_config.yaml') +@patch.object(skypilot_config, '_dict', None) +@patch.object(skypilot_config, '_loaded_config_path', None) +@patch("sky.clouds.service_catalog.instance_type_exists", return_value=True) +@patch("sky.clouds.service_catalog.get_accelerators_from_instance_type", + return_value={"fake-acc": 2}) +@patch("sky.clouds.service_catalog.get_image_id_from_tag", + return_value="fake-image") +def test_aws_make_deploy_variables(*mocks) -> None: + skypilot_config._try_load_config() + + cloud = clouds.AWS() + cluster_name = resources_utils.ClusterName(display_name='display', + name_on_cloud='cloud') + region = clouds.Region(name='fake-region') + zones = [clouds.Zone(name='fake-zone')] + resource = Resources(cloud=cloud, instance_type="fake-type: 3") + config = resource.make_deploy_variables(cluster_name, + region, + zones, + dryrun=True) + + expected_config_base = { + 'instance_type': resource.instance_type, + 'custom_resources': '{"fake-acc":2}', + 'use_spot': False, + 'region': 'fake-region', + 'image_id': 'fake-image', + 'disk_tier': 'gp3', + 'disk_throughput': 218, + 'disk_iops': 3500, + 'custom_disk_perf': True, + 'docker_image': None, + 'docker_container_name': 'sky_container', + 'docker_login_config': None, + 'zones': 'fake-zone' + } + + # test using defaults + expected_config = expected_config_base.copy() + expected_config.update({ + 'security_group': "fake-skypilot-default", + 'security_group_managed_by_skypilot': 'false' + }) + assert config == expected_config, ('unexpected resource ' + 'variables generated') + + # test using culuster matches regex, top + cluster_name = resources_utils.ClusterName( + display_name='display', name_on_cloud='sky-serve-fake1-1234') + expected_config = expected_config_base.copy() + expected_config.update({ + 'security_group': "fake-1-sg", + 'security_group_managed_by_skypilot': 'false' + }) + config = resource.make_deploy_variables(cluster_name, + region, + zones, + dryrun=True) + assert config == expected_config, ('unexpected resource ' + 'variables generated') + + # test using culuster matches regex, middle + cluster_name = resources_utils.ClusterName( + display_name='display', name_on_cloud='sky-serve-fake2-1234') + expected_config = expected_config_base.copy() + expected_config.update({ + 'security_group': "fake-2-sg", + 'security_group_managed_by_skypilot': 'false' + }) + config = resource.make_deploy_variables(cluster_name, + region, + zones, + dryrun=True) + assert config == expected_config, ('unexpected resource ' + 'variables generated') From fdeaa2d24587914ee43f593f301f6bab306e2b9b Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 11/20] fix: pylint --- sky/utils/schemas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 71829ea1de5..a7eb148c516 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -706,7 +706,8 @@ def get_config_schema(): 'required': [], 'additionalProperties': False, 'properties': { - 'security_group_name': _PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY, + 'security_group_name': + (_PRORPERTY_NAME_OR_CLUSTER_NAME_TO_PROPERTY), **_LABELS_SCHEMA, **_NETWORK_CONFIG_SCHEMA, }, From f0572c3062b6e73df9e577fce277cccab0d47f8a Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 12/20] refactor: updates to use display name --- sky/clouds/aws.py | 30 +++++++++++++-------------- tests/test_yamls/test_aws_config.yaml | 2 -- tests/unit_tests/test_resources.py | 9 ++++---- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 967214998aa..4646b275e86 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -399,12 +399,15 @@ def make_deploy_resources_variables( image_id = self._get_image_id(image_id_to_use, region_name, r.instance_type) - user_security_group = skypilot_config.get_nested( + + user_security_group_config = skypilot_config.get_nested( ('aws', 'security_group_name'), None) - if user_security_group is not None and not isinstance( - user_security_group, str): - for profile in user_security_group: - if fnmatch.fnmatchcase(cluster_name.name_on_cloud, + user_security_group = None + if isinstance(user_security_group_config, str): + user_security_group = user_security_group_config + elif isinstance(user_security_group_config, list): + for profile in user_security_group_config: + if fnmatch.fnmatchcase(cluster_name.display_name, list(profile.keys())[0]): user_security_group = list(profile.values())[0] break @@ -414,18 +417,15 @@ def make_deploy_resources_variables( if resources.ports is not None: # Already checked in Resources._try_validate_ports security_group = USER_PORTS_SECURITY_GROUP_NAME.format( - cluster_name.name_on_cloud) - elif security_group is not None and resources.ports is not None: + cluster_name.display_name) + elif resources.ports is not None: with ux_utils.print_exception_no_traceback(): logger.warning( - f'Ports {resources.ports} and security group name are ' - f'specified: {security_group}. It is not ' - 'guaranteed that the ports will be opened if the ' - 'specified security group is not correctly set up. ' - 'Please try to specify `ports` only and leave out ' - '`aws.security_group_name` in `~/.sky/config.yaml` to ' - 'allow SkyPilot to automatically create and configure ' - 'the security group.') + f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, ' + 'as `aws.security_group_name` in `~/.sky/config.yaml` is specified as ' + f' {security_group!r}. Please make sure the specified security group ' + 'has requested ports setup; or, leave out `aws.security_group_name` ' + 'in `~/.sky/config.yaml`.') return { 'instance_type': r.instance_type, diff --git a/tests/test_yamls/test_aws_config.yaml b/tests/test_yamls/test_aws_config.yaml index 27ac4d4f555..4cda26dc9b7 100644 --- a/tests/test_yamls/test_aws_config.yaml +++ b/tests/test_yamls/test_aws_config.yaml @@ -3,9 +3,7 @@ aws: remote_identity: - sky-serve-fake1-*: fake1-skypilot-role - sky-serve-fake2-*: fake1-skypilot-role - - "*": fake-skypilot-default-role security_group_name: - sky-serve-fake1-*: fake-1-sg - sky-serve-fake2-*: fake-2-sg - - "*": fake-skypilot-default diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index 097c69d6400..20da1a85ce4 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -100,6 +100,7 @@ def test_kubernetes_labels_resources(): return_value={"fake-acc": 2}) @patch("sky.clouds.service_catalog.get_image_id_from_tag", return_value="fake-image") +@patch.object(clouds.aws, "DEFAULT_SECURITY_GROUP_NAME", "fake-default-sg") def test_aws_make_deploy_variables(*mocks) -> None: skypilot_config._try_load_config() @@ -133,15 +134,15 @@ def test_aws_make_deploy_variables(*mocks) -> None: # test using defaults expected_config = expected_config_base.copy() expected_config.update({ - 'security_group': "fake-skypilot-default", - 'security_group_managed_by_skypilot': 'false' + 'security_group': "fake-default-sg", + 'security_group_managed_by_skypilot': 'true' }) assert config == expected_config, ('unexpected resource ' 'variables generated') # test using culuster matches regex, top cluster_name = resources_utils.ClusterName( - display_name='display', name_on_cloud='sky-serve-fake1-1234') + display_name='sky-serve-fake1-1234', name_on_cloud='name-on-cloud') expected_config = expected_config_base.copy() expected_config.update({ 'security_group': "fake-1-sg", @@ -156,7 +157,7 @@ def test_aws_make_deploy_variables(*mocks) -> None: # test using culuster matches regex, middle cluster_name = resources_utils.ClusterName( - display_name='display', name_on_cloud='sky-serve-fake2-1234') + display_name='sky-serve-fake2-1234', name_on_cloud='name-on-cloud') expected_config = expected_config_base.copy() expected_config.update({ 'security_group': "fake-2-sg", From 42ec7c0155b6930b09a81b1e233baf5f307b59c8 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 13/20] fix: bug in remote identity and update tests --- sky/backends/backend_utils.py | 12 +-- tests/test_yamls/test_aws_config.yaml | 2 +- tests/unit_tests/test_backend_utils.py | 117 +++++++++++++++++++++++++ tests/unit_tests/test_resources.py | 24 ++--- 4 files changed, 137 insertions(+), 18 deletions(-) create mode 100644 tests/unit_tests/test_backend_utils.py diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 854c1baced8..92dee9f770b 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -810,11 +810,13 @@ def write_cluster_config( assert cluster_name is not None excluded_clouds = [] - remote_identity = skypilot_config.get_nested( - (str(cloud).lower(), 'remote_identity'), - schemas.get_default_remote_identity(str(cloud).lower())) - if remote_identity is not None and not isinstance(remote_identity, str): - for profile in remote_identity: + remote_identity_config = skypilot_config.get_nested( + (str(cloud).lower(), 'remote_identity'), None) + remote_identity = schemas.get_default_remote_identity(str(cloud).lower()) + if isinstance(remote_identity_config, str): + remote_identity = remote_identity_config + if isinstance(remote_identity_config, list): + for profile in remote_identity_config: if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]): remote_identity = list(profile.values())[0] break diff --git a/tests/test_yamls/test_aws_config.yaml b/tests/test_yamls/test_aws_config.yaml index 4cda26dc9b7..047334703c1 100644 --- a/tests/test_yamls/test_aws_config.yaml +++ b/tests/test_yamls/test_aws_config.yaml @@ -2,7 +2,7 @@ aws: vpc_name: fake-vpc remote_identity: - sky-serve-fake1-*: fake1-skypilot-role - - sky-serve-fake2-*: fake1-skypilot-role + - sky-serve-fake2-*: fake2-skypilot-role security_group_name: - sky-serve-fake1-*: fake-1-sg diff --git a/tests/unit_tests/test_backend_utils.py b/tests/unit_tests/test_backend_utils.py new file mode 100644 index 00000000000..ff786aac5f2 --- /dev/null +++ b/tests/unit_tests/test_backend_utils.py @@ -0,0 +1,117 @@ +import pathlib +from typing import Dict +from unittest.mock import Mock +from unittest.mock import patch + +import pytest + +from sky import clouds +from sky import skypilot_config +from sky.resources import Resources +from sky.resources import resources_utils +from sky.backends import backend_utils + + +@patch.object(skypilot_config, 'CONFIG_PATH', + './tests/test_yamls/test_aws_config.yaml') +@patch.object(skypilot_config, '_dict', None) +@patch.object(skypilot_config, '_loaded_config_path', None) +@patch('sky.clouds.service_catalog.instance_type_exists', return_value=True) +@patch('sky.clouds.service_catalog.get_accelerators_from_instance_type', + return_value={'fake-acc': 2}) +@patch('sky.clouds.service_catalog.get_image_id_from_tag', + return_value='fake-image') +@patch.object(clouds.aws, 'DEFAULT_SECURITY_GROUP_NAME', 'fake-default-sg') +@patch('sky.check.get_cloud_credential_file_mounts', return_value='~/.aws/credentials') +@patch('sky.backends.backend_utils._get_yaml_path_from_cluster_name', return_value='/tmp/fake/path') +@patch('sky.utils.common_utils.fill_template') +def test_write_cluster_config_w_remote_identity(mock_fill_template, *mocks) -> None: + skypilot_config._try_load_config() + + cloud = clouds.AWS() + + region = clouds.Region(name='fake-region') + zones = [clouds.Zone(name='fake-zone')] + resource = Resources(cloud=cloud, instance_type='fake-type: 3') + + cluster_config_template = 'aws-ray.yml.j2' + + # test default + backend_utils.write_cluster_config( + to_provision=resource, + num_nodes=2, + cluster_config_template=cluster_config_template, + cluster_name="display", + local_wheel_path=pathlib.Path('/tmp/fake'), + wheel_hash='b1bd84059bc0342f7843fcbe04ab563e', + region=region, + zones=zones, + dryrun=True, + keep_launch_fields_in_existing_config=True + ) + + expected_subset = { + 'instance_type': 'fake-type: 3', + 'custom_resources': '{"fake-acc":2}', + 'region': 'fake-region', + 'zones': 'fake-zone', + 'image_id': 'fake-image', + 'security_group': 'fake-default-sg', + 'security_group_managed_by_skypilot': 'true', + 'vpc_name': 'fake-vpc', + 'remote_identity': 'LOCAL_CREDENTIALS', # remote identity + 'sky_local_path': '/tmp/fake', + 'sky_wheel_hash': 'b1bd84059bc0342f7843fcbe04ab563e', + } + + mock_fill_template.assert_called_once() + assert mock_fill_template.call_args[0][0] == cluster_config_template, "config template incorrect" + assert mock_fill_template.call_args[0][1].items() >= expected_subset.items(), "config fill values incorrect" + + # test using cluster matches regex, top + mock_fill_template.reset_mock() + expected_subset.update({ + 'security_group': 'fake-1-sg', + 'security_group_managed_by_skypilot': 'false', + 'remote_identity': 'fake1-skypilot-role' + }) + backend_utils.write_cluster_config( + to_provision=resource, + num_nodes=2, + cluster_config_template=cluster_config_template, + cluster_name="sky-serve-fake1-1234", + local_wheel_path=pathlib.Path('/tmp/fake'), + wheel_hash='b1bd84059bc0342f7843fcbe04ab563e', + region=region, + zones=zones, + dryrun=True, + keep_launch_fields_in_existing_config=True + ) + + mock_fill_template.assert_called_once() + assert mock_fill_template.call_args[0][0] == cluster_config_template, "config template incorrect" + assert mock_fill_template.call_args[0][1].items() >= expected_subset.items(), "config fill values incorrect" + + # test using cluster matches regex, middle + mock_fill_template.reset_mock() + expected_subset.update({ + 'security_group': 'fake-2-sg', + 'security_group_managed_by_skypilot': 'false', + 'remote_identity': 'fake2-skypilot-role' + }) + backend_utils.write_cluster_config( + to_provision=resource, + num_nodes=2, + cluster_config_template=cluster_config_template, + cluster_name="sky-serve-fake2-1234", + local_wheel_path=pathlib.Path('/tmp/fake'), + wheel_hash='b1bd84059bc0342f7843fcbe04ab563e', + region=region, + zones=zones, + dryrun=True, + keep_launch_fields_in_existing_config=True + ) + + mock_fill_template.assert_called_once() + assert mock_fill_template.call_args[0][0] == cluster_config_template, "config template incorrect" + assert mock_fill_template.call_args[0][1].items() >= expected_subset.items(), "config fill values incorrect" diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index 20da1a85ce4..d7643459ef0 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -95,12 +95,12 @@ def test_kubernetes_labels_resources(): './tests/test_yamls/test_aws_config.yaml') @patch.object(skypilot_config, '_dict', None) @patch.object(skypilot_config, '_loaded_config_path', None) -@patch("sky.clouds.service_catalog.instance_type_exists", return_value=True) -@patch("sky.clouds.service_catalog.get_accelerators_from_instance_type", - return_value={"fake-acc": 2}) -@patch("sky.clouds.service_catalog.get_image_id_from_tag", - return_value="fake-image") -@patch.object(clouds.aws, "DEFAULT_SECURITY_GROUP_NAME", "fake-default-sg") +@patch('sky.clouds.service_catalog.instance_type_exists', return_value=True) +@patch('sky.clouds.service_catalog.get_accelerators_from_instance_type', + return_value={'fake-acc': 2}) +@patch('sky.clouds.service_catalog.get_image_id_from_tag', + return_value='fake-image') +@patch.object(clouds.aws, 'DEFAULT_SECURITY_GROUP_NAME', 'fake-default-sg') def test_aws_make_deploy_variables(*mocks) -> None: skypilot_config._try_load_config() @@ -109,7 +109,7 @@ def test_aws_make_deploy_variables(*mocks) -> None: name_on_cloud='cloud') region = clouds.Region(name='fake-region') zones = [clouds.Zone(name='fake-zone')] - resource = Resources(cloud=cloud, instance_type="fake-type: 3") + resource = Resources(cloud=cloud, instance_type='fake-type: 3') config = resource.make_deploy_variables(cluster_name, region, zones, @@ -134,18 +134,18 @@ def test_aws_make_deploy_variables(*mocks) -> None: # test using defaults expected_config = expected_config_base.copy() expected_config.update({ - 'security_group': "fake-default-sg", + 'security_group': 'fake-default-sg', 'security_group_managed_by_skypilot': 'true' }) assert config == expected_config, ('unexpected resource ' 'variables generated') - # test using culuster matches regex, top + # test using cluster matches regex, top cluster_name = resources_utils.ClusterName( display_name='sky-serve-fake1-1234', name_on_cloud='name-on-cloud') expected_config = expected_config_base.copy() expected_config.update({ - 'security_group': "fake-1-sg", + 'security_group': 'fake-1-sg', 'security_group_managed_by_skypilot': 'false' }) config = resource.make_deploy_variables(cluster_name, @@ -155,12 +155,12 @@ def test_aws_make_deploy_variables(*mocks) -> None: assert config == expected_config, ('unexpected resource ' 'variables generated') - # test using culuster matches regex, middle + # test using cluster matches regex, middle cluster_name = resources_utils.ClusterName( display_name='sky-serve-fake2-1234', name_on_cloud='name-on-cloud') expected_config = expected_config_base.copy() expected_config.update({ - 'security_group': "fake-2-sg", + 'security_group': 'fake-2-sg', 'security_group_managed_by_skypilot': 'false' }) config = resource.make_deploy_variables(cluster_name, From d7a0004ac5e34c82074175fd3753469c3511db5f Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 14/20] fix: formatting --- tests/unit_tests/test_backend_utils.py | 38 +++++++++++++++----------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tests/unit_tests/test_backend_utils.py b/tests/unit_tests/test_backend_utils.py index ff786aac5f2..cb1b83f1999 100644 --- a/tests/unit_tests/test_backend_utils.py +++ b/tests/unit_tests/test_backend_utils.py @@ -7,9 +7,9 @@ from sky import clouds from sky import skypilot_config +from sky.backends import backend_utils from sky.resources import Resources from sky.resources import resources_utils -from sky.backends import backend_utils @patch.object(skypilot_config, 'CONFIG_PATH', @@ -22,10 +22,13 @@ @patch('sky.clouds.service_catalog.get_image_id_from_tag', return_value='fake-image') @patch.object(clouds.aws, 'DEFAULT_SECURITY_GROUP_NAME', 'fake-default-sg') -@patch('sky.check.get_cloud_credential_file_mounts', return_value='~/.aws/credentials') -@patch('sky.backends.backend_utils._get_yaml_path_from_cluster_name', return_value='/tmp/fake/path') +@patch('sky.check.get_cloud_credential_file_mounts', + return_value='~/.aws/credentials') +@patch('sky.backends.backend_utils._get_yaml_path_from_cluster_name', + return_value='/tmp/fake/path') @patch('sky.utils.common_utils.fill_template') -def test_write_cluster_config_w_remote_identity(mock_fill_template, *mocks) -> None: +def test_write_cluster_config_w_remote_identity(mock_fill_template, + *mocks) -> None: skypilot_config._try_load_config() cloud = clouds.AWS() @@ -47,8 +50,7 @@ def test_write_cluster_config_w_remote_identity(mock_fill_template, *mocks) -> N region=region, zones=zones, dryrun=True, - keep_launch_fields_in_existing_config=True - ) + keep_launch_fields_in_existing_config=True) expected_subset = { 'instance_type': 'fake-type: 3', @@ -65,8 +67,10 @@ def test_write_cluster_config_w_remote_identity(mock_fill_template, *mocks) -> N } mock_fill_template.assert_called_once() - assert mock_fill_template.call_args[0][0] == cluster_config_template, "config template incorrect" - assert mock_fill_template.call_args[0][1].items() >= expected_subset.items(), "config fill values incorrect" + assert mock_fill_template.call_args[0][ + 0] == cluster_config_template, "config template incorrect" + assert mock_fill_template.call_args[0][1].items() >= expected_subset.items( + ), "config fill values incorrect" # test using cluster matches regex, top mock_fill_template.reset_mock() @@ -85,12 +89,13 @@ def test_write_cluster_config_w_remote_identity(mock_fill_template, *mocks) -> N region=region, zones=zones, dryrun=True, - keep_launch_fields_in_existing_config=True - ) + keep_launch_fields_in_existing_config=True) mock_fill_template.assert_called_once() - assert mock_fill_template.call_args[0][0] == cluster_config_template, "config template incorrect" - assert mock_fill_template.call_args[0][1].items() >= expected_subset.items(), "config fill values incorrect" + assert (mock_fill_template.call_args[0][0] == cluster_config_template, + "config template incorrect") + assert (mock_fill_template.call_args[0][1].items() >= + expected_subset.items(), "config fill values incorrect") # test using cluster matches regex, middle mock_fill_template.reset_mock() @@ -109,9 +114,10 @@ def test_write_cluster_config_w_remote_identity(mock_fill_template, *mocks) -> N region=region, zones=zones, dryrun=True, - keep_launch_fields_in_existing_config=True - ) + keep_launch_fields_in_existing_config=True) mock_fill_template.assert_called_once() - assert mock_fill_template.call_args[0][0] == cluster_config_template, "config template incorrect" - assert mock_fill_template.call_args[0][1].items() >= expected_subset.items(), "config fill values incorrect" + assert (mock_fill_template.call_args[0][0] == cluster_config_template, + "config template incorrect") + assert (mock_fill_template.call_args[0][1].items() >= + expected_subset.items(), "config fill values incorrect") From bfa987408b83227a4d341fcaf679237f0d3ba899 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 15/20] fix: remove --- sky/backends/backend_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 92dee9f770b..65c32293b80 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -158,8 +158,6 @@ ('available_node_types', 'ray.head.default', 'node_config', 'IamInstanceProfile'), ('available_node_types', 'ray.head.default', 'node_config', 'UserData'), - ('available_node_types', 'ray.worker.default', 'node_config', - 'IamInstanceProfile'), ('available_node_types', 'ray.worker.default', 'node_config', 'UserData'), ] From e89ede6b47af8bd9a0296195a6bfd614fc4dbfc3 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:52:42 -0400 Subject: [PATCH 16/20] fix: format --- sky/clouds/aws.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index 4646b275e86..cb09a3c6bc7 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -399,7 +399,6 @@ def make_deploy_resources_variables( image_id = self._get_image_id(image_id_to_use, region_name, r.instance_type) - user_security_group_config = skypilot_config.get_nested( ('aws', 'security_group_name'), None) user_security_group = None From 7796022880d14a13be778869701e9f30aaed9cef Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 14:57:58 -0400 Subject: [PATCH 17/20] fix: missing resources_utils ClusterName --- sky/execution.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/execution.py b/sky/execution.py index b1fb4ec4164..73ab1f75b47 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -19,6 +19,7 @@ from sky.utils import controller_utils from sky.utils import dag_utils from sky.utils import env_options +from sky.utils import resources_utils from sky.utils import rich_utils from sky.utils import subprocess_utils from sky.utils import timeline @@ -56,7 +57,9 @@ def _maybe_clone_disk_from_cluster(clone_disk_from: Optional[str], f'{clone_disk_from!r}'): image_id = original_cloud.create_image_from_cluster( clone_disk_from, - handle.cluster_name_on_cloud, + resources_utils.ClusterName( + display_name=handle.cluster_name, + name_on_cloud=handle.cluster_name_on_cloud), region=handle.launched_resources.region, zone=handle.launched_resources.zone, ) From 28f090679c340082e61ef55079b8e43706243ebe Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 15:34:49 -0400 Subject: [PATCH 18/20] fix: tests --- tests/unit_tests/test_resources.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit_tests/test_resources.py b/tests/unit_tests/test_resources.py index d7643459ef0..450ca692f0a 100644 --- a/tests/unit_tests/test_resources.py +++ b/tests/unit_tests/test_resources.py @@ -128,6 +128,8 @@ def test_aws_make_deploy_variables(*mocks) -> None: 'docker_image': None, 'docker_container_name': 'sky_container', 'docker_login_config': None, + 'docker_run_options': [], + 'initial_setup_commands': [], 'zones': 'fake-zone' } From 9fd434ebb9256b27053739de238ac7272858a93a Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 16:20:00 -0400 Subject: [PATCH 19/20] fix: bug --- sky/execution.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/execution.py b/sky/execution.py index 73ab1f75b47..d75afddb873 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -56,8 +56,7 @@ def _maybe_clone_disk_from_cluster(clone_disk_from: Optional[str], with rich_utils.safe_status('Creating image from source cluster ' f'{clone_disk_from!r}'): image_id = original_cloud.create_image_from_cluster( - clone_disk_from, - resources_utils.ClusterName( + cluster_name=resources_utils.ClusterName( display_name=handle.cluster_name, name_on_cloud=handle.cluster_name_on_cloud), region=handle.launched_resources.region, From 5641696923a83c5c4a245d769b37cd93162f47a9 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 10 Jul 2024 16:26:33 -0400 Subject: [PATCH 20/20] fix: clone_disk_from reference --- sky/execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/execution.py b/sky/execution.py index d75afddb873..1f6bd09f9c3 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -57,7 +57,7 @@ def _maybe_clone_disk_from_cluster(clone_disk_from: Optional[str], f'{clone_disk_from!r}'): image_id = original_cloud.create_image_from_cluster( cluster_name=resources_utils.ClusterName( - display_name=handle.cluster_name, + display_name=clone_disk_from, name_on_cloud=handle.cluster_name_on_cloud), region=handle.launched_resources.region, zone=handle.launched_resources.zone,