-
Notifications
You must be signed in to change notification settings - Fork 483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Core][AWS] Allow specification of Security Groups for resources. #3501
Changes from all commits
3c3ce61
6d89ce9
c4d500f
2be0d53
54b6091
0582226
9e9e5a6
a990eec
db2b69a
1da5428
fdeaa2d
f0572c3
42ec7c0
d7a0004
bfa9874
e89ede6
7796022
28f0906
9fd434e
5641696
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -155,6 +155,8 @@ | |
# we need to take this field from the new yaml. | ||
('provider', 'tpu_node'), | ||
('provider', 'security_group', 'GroupName'), | ||
('available_node_types', 'ray.head.default', 'node_config', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixes an error with IAM roles if the cluster previously existed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this mean we want to always update the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess my thoughts are if you have a specified cluster name and you change the IAM or SG, I would expect the cluster to update. However, maybe that isn't the desired behavior? Otherwise, one would have to tear down the entire cluster to change it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Michaelvll I did test launching with different names to ensure it changes with the name in serve between controller and worker. In this case the controller and the worker are getting different cluster names. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The behavior sounds fine to me, except that it may cause a risk that the controller may get a different IamInstanceProfile that no longer has permission to access the previous service replicas or managed job clusters. |
||
'IamInstanceProfile'), | ||
('available_node_types', 'ray.head.default', 'node_config', 'UserData'), | ||
('available_node_types', 'ray.worker.default', 'node_config', 'UserData'), | ||
] | ||
|
@@ -793,8 +795,11 @@ def write_cluster_config( | |
# move the check out of this function, i.e. the caller should be responsible | ||
# for the validation. | ||
# TODO(tian): Move more cloud agnostic vars to resources.py. | ||
resources_vars = to_provision.make_deploy_variables(cluster_name_on_cloud, | ||
region, zones, dryrun) | ||
resources_vars = to_provision.make_deploy_variables( | ||
resources_utils.ClusterName( | ||
cluster_name, | ||
cluster_name_on_cloud, | ||
), region, zones, dryrun) | ||
config_dict = {} | ||
|
||
specific_reservations = set( | ||
|
@@ -803,11 +808,13 @@ def write_cluster_config( | |
|
||
assert cluster_name is not None | ||
excluded_clouds = [] | ||
remote_identity = skypilot_config.get_nested( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Michaelvll fixed a bug here when there was no match. |
||
(str(cloud).lower(), 'remote_identity'), | ||
schemas.get_default_remote_identity(str(cloud).lower())) | ||
if remote_identity is not None and not isinstance(remote_identity, str): | ||
for profile in remote_identity: | ||
remote_identity_config = skypilot_config.get_nested( | ||
(str(cloud).lower(), 'remote_identity'), None) | ||
remote_identity = schemas.get_default_remote_identity(str(cloud).lower()) | ||
if isinstance(remote_identity_config, str): | ||
remote_identity = remote_identity_config | ||
if isinstance(remote_identity_config, list): | ||
for profile in remote_identity_config: | ||
if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]): | ||
remote_identity = list(profile.values())[0] | ||
break | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
"""Amazon Web Services.""" | ||
import enum | ||
import fnmatch | ||
import functools | ||
import json | ||
import os | ||
|
@@ -370,12 +371,13 @@ def get_vcpus_mem_from_instance_type( | |
return service_catalog.get_vcpus_mem_from_instance_type(instance_type, | ||
clouds='aws') | ||
|
||
def make_deploy_resources_variables(self, | ||
resources: 'resources_lib.Resources', | ||
cluster_name_on_cloud: str, | ||
region: 'clouds.Region', | ||
zones: Optional[List['clouds.Zone']], | ||
dryrun: bool = False) -> Dict[str, Any]: | ||
def make_deploy_resources_variables( | ||
self, | ||
resources: 'resources_lib.Resources', | ||
cluster_name: resources_utils.ClusterName, | ||
region: 'clouds.Region', | ||
zones: Optional[List['clouds.Zone']], | ||
dryrun: bool = False) -> Dict[str, Any]: | ||
del dryrun # unused | ||
assert zones is not None, (region, zones) | ||
|
||
|
@@ -397,18 +399,32 @@ def make_deploy_resources_variables(self, | |
image_id = self._get_image_id(image_id_to_use, region_name, | ||
r.instance_type) | ||
|
||
user_security_group = skypilot_config.get_nested( | ||
user_security_group_config = skypilot_config.get_nested( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this fixes a bug if no match is found when a list is provided |
||
('aws', 'security_group_name'), None) | ||
if resources.ports is not None: | ||
# Already checked in Resources._try_validate_ports | ||
assert user_security_group is None | ||
security_group = USER_PORTS_SECURITY_GROUP_NAME.format( | ||
cluster_name_on_cloud) | ||
elif user_security_group is not None: | ||
assert resources.ports is None | ||
security_group = user_security_group | ||
else: | ||
user_security_group = None | ||
if isinstance(user_security_group_config, str): | ||
user_security_group = user_security_group_config | ||
elif isinstance(user_security_group_config, list): | ||
for profile in user_security_group_config: | ||
if fnmatch.fnmatchcase(cluster_name.display_name, | ||
list(profile.keys())[0]): | ||
user_security_group = list(profile.values())[0] | ||
break | ||
security_group = user_security_group | ||
if security_group is None: | ||
security_group = DEFAULT_SECURITY_GROUP_NAME | ||
if resources.ports is not None: | ||
# Already checked in Resources._try_validate_ports | ||
security_group = USER_PORTS_SECURITY_GROUP_NAME.format( | ||
cluster_name.display_name) | ||
elif resources.ports is not None: | ||
with ux_utils.print_exception_no_traceback(): | ||
logger.warning( | ||
f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, ' | ||
'as `aws.security_group_name` in `~/.sky/config.yaml` is specified as ' | ||
f' {security_group!r}. Please make sure the specified security group ' | ||
'has requested ports setup; or, leave out `aws.security_group_name` ' | ||
'in `~/.sky/config.yaml`.') | ||
|
||
return { | ||
'instance_type': r.instance_type, | ||
|
@@ -840,22 +856,24 @@ def query_status(cls, name: str, tag_filters: Dict[str, str], | |
assert False, 'This code path should not be used.' | ||
|
||
@classmethod | ||
def create_image_from_cluster(cls, cluster_name: str, | ||
cluster_name_on_cloud: str, | ||
def create_image_from_cluster(cls, | ||
cluster_name: resources_utils.ClusterName, | ||
region: Optional[str], | ||
zone: Optional[str]) -> str: | ||
assert region is not None, (cluster_name, cluster_name_on_cloud, region) | ||
assert region is not None, (cluster_name.display_name, | ||
cluster_name.name_on_cloud, region) | ||
del zone # unused | ||
|
||
image_name = f'skypilot-{cluster_name}-{int(time.time())}' | ||
image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}' | ||
|
||
status = provision_lib.query_instances('AWS', cluster_name_on_cloud, | ||
status = provision_lib.query_instances('AWS', | ||
cluster_name.name_on_cloud, | ||
{'region': region}) | ||
instance_ids = list(status.keys()) | ||
if not instance_ids: | ||
with ux_utils.print_exception_no_traceback(): | ||
raise RuntimeError( | ||
f'Failed to find the source cluster {cluster_name!r} on ' | ||
f'Failed to find the source cluster {cluster_name.display_name!r} on ' | ||
'AWS.') | ||
|
||
if len(instance_ids) != 1: | ||
|
@@ -882,7 +900,7 @@ def create_image_from_cluster(cls, cluster_name: str, | |
stream_logs=True) | ||
|
||
rich_utils.force_update_status( | ||
f'Waiting for the source image {cluster_name!r} from {region} to be available on AWS.' | ||
f'Waiting for the source image {cluster_name.display_name!r} from {region} to be available on AWS.' | ||
) | ||
# Wait for the image to be available | ||
wait_image_cmd = ( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to match the format of remote identity.