Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[k8s] Show hints when requested resources don't fit in Kubernetes cluster #3590

Merged
merged 10 commits into from
Jul 22, 2024
14 changes: 7 additions & 7 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ def make_deploy_resources_variables(self,

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems the return value is getting longer. Could we also do a refactoring for it in this PR, e.g., creating a data class for the return type to make it simpler?

@dataclasses.dataclass
class FeasibleResourcesWithHints:
    pass

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be fixed now! Added resource_utils.FeasibleResources.

if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Check the instance type is valid in the cloud
Expand All @@ -436,10 +436,10 @@ def _get_feasible_launchable_resources(
region=resources.region,
zone=resources.zone)
if not regions:
return ([], [])
return ([], [], None)
# Treat Resources(AWS, p3.2x, V100) as Resources(AWS, p3.2x).
resources = resources.copy(accelerators=None)
return ([resources], [])
return ([resources], [], None)

def _make(instance_list):
resource_list = []
Expand All @@ -465,9 +465,9 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return ([], [], None)
else:
return (_make([default_instance_type]), [])
return (_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -482,8 +482,8 @@ def _make(instance_list):
zone=resources.zone,
clouds='aws')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)

@classmethod
@functools.lru_cache(maxsize=1) # Cache since getting identity is slow.
Expand Down
14 changes: 7 additions & 7 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,17 +376,17 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:

def _get_feasible_launchable_resources(
self, resources: 'resources.Resources'
) -> Tuple[List['resources.Resources'], List[str]]:
) -> Tuple[List['resources.Resources'], List[str], Optional[str]]:
if resources.instance_type is not None:
assert resources.is_launchable(), resources
ok, _ = Azure.check_disk_tier(resources.instance_type,
resources.disk_tier)
if not ok:
return ([], [])
return ([], [], None)
# Treat Resources(Azure, Standard_NC4as_T4_v3, T4) as
# Resources(Azure, Standard_NC4as_T4_v3).
resources = resources.copy(accelerators=None)
return ([resources], [])
return ([resources], [], None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -416,9 +416,9 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return ([], [], None)
else:
return (_make([default_instance_type]), [])
return (_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -433,8 +433,8 @@ def _make(instance_list):
zone=resources.zone,
clouds='azure')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
11 changes: 7 additions & 4 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,15 +344,18 @@ def get_feasible_launchable_resources(
self,
resources: 'resources_lib.Resources',
num_nodes: int = 1
) -> Tuple[List['resources_lib.Resources'], List[str]]:
"""Returns ([feasible and launchable resources], [fuzzy candidates]).
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
"""Returns ([feasible & launchable resources], [fuzzy candidates], hint)

Feasible resources refer to an offering respecting the resource
requirements. Currently, this function implements "filtering" the
cloud's offerings only w.r.t. accelerators constraints.

Launchable resources require a cloud and an instance type be assigned.

The cloud may optionally return a string hint to the user if no feasible
resources are found.

Fuzzy candidates example: when the requested GPU is A100:1 but is not
available in a cloud/region, the fuzzy candidates are results of a fuzzy
search in the catalog that are offered in the location. E.g.,
Expand All @@ -372,12 +375,12 @@ def get_feasible_launchable_resources(
# TODO(zhwu): The resources are now silently filtered out. We
# should have some logging telling the user why the resources
# are not considered.
return ([], [])
return ([], [], None)
return self._get_feasible_launchable_resources(resources)

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
"""See get_feasible_launchable_resources()."""
raise NotImplementedError

Expand Down
12 changes: 6 additions & 6 deletions sky/clouds/cudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,11 +203,11 @@ def make_deploy_resources_variables(
def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'):
if resources.use_spot:
return ([], [])
return ([], [], None)
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], [])
return ([resources], [], None)

def _make(instance_list):
resource_list = []
Expand All @@ -230,9 +230,9 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return ([], [], None)
else:
return (_make([default_instance_type]), [])
return (_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -247,8 +247,8 @@ def _make(instance_list):
zone=resources.zone,
clouds='cudo')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
10 changes: 5 additions & 5 deletions sky/clouds/fluidstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def _get_feasible_launchable_resources(
assert resources.is_launchable(), resources
# Accelerators are part of the instance type in Fluidstack Cloud
resources = resources.copy(accelerators=None)
return ([resources], [])
return ([resources], [], None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -238,9 +238,9 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return ([], [], None)
else:
return (_make([default_instance_type]), [])
return (_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -255,8 +255,8 @@ def _make(instance_list):
zone=resources.zone,
clouds='fluidstack')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
20 changes: 10 additions & 10 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,10 +497,10 @@ def make_deploy_resources_variables(

def _get_feasible_launchable_resources(
self, resources: 'resources.Resources'
) -> Tuple[List['resources.Resources'], List[str]]:
) -> Tuple[List['resources.Resources'], List[str], Optional[str]]:
if resources.instance_type is not None:
assert resources.is_launchable(), resources
return ([resources], [])
return ([resources], [], None)

if resources.accelerators is None:
# Return a default instance type with the given number of vCPUs.
Expand All @@ -509,7 +509,7 @@ def _get_feasible_launchable_resources(
memory=resources.memory,
disk_tier=resources.disk_tier)
if host_vm_type is None:
return ([], [])
return ([], [], None)
else:
r = resources.copy(
cloud=GCP(),
Expand All @@ -518,7 +518,7 @@ def _get_feasible_launchable_resources(
cpus=None,
memory=None,
)
return ([r], [])
return ([r], [], None)

# Find instance candidates to meet user's requirements
assert len(resources.accelerators.items()
Expand All @@ -540,7 +540,7 @@ def _get_feasible_launchable_resources(
clouds='gcp')

if instance_list is None:
return ([], fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
assert len(
instance_list
) == 1, f'More than one instance type matched, {instance_list}'
Expand All @@ -555,11 +555,11 @@ def _get_feasible_launchable_resources(
if resources.cpus.endswith('+'):
cpus = float(resources.cpus[:-1])
if cpus > num_cpus_in_tpu_vm:
return ([], fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
else:
cpus = float(resources.cpus)
if cpus != num_cpus_in_tpu_vm:
return ([], fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
# FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs
# have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move
# this to service catalog, instead.
Expand All @@ -568,11 +568,11 @@ def _get_feasible_launchable_resources(
if resources.memory.endswith('+'):
memory = float(resources.memory[:-1])
if memory > memory_in_tpu_vm:
return ([], fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
else:
memory = float(resources.memory)
if memory != memory_in_tpu_vm:
return ([], fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
else:
host_vm_type = instance_list[0]

Expand All @@ -584,7 +584,7 @@ def _get_feasible_launchable_resources(
cpus=None,
memory=None,
)
return ([r], fuzzy_candidate_list)
return ([r], fuzzy_candidate_list, None)

@classmethod
def get_accelerators_from_instance_type(
Expand Down
12 changes: 6 additions & 6 deletions sky/clouds/ibm.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,12 +266,12 @@ def get_default_instance_type(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
fuzzy_candidate_list: List[str] = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
return ([resources], fuzzy_candidate_list, None)

def _make(instance_list):
resource_list = []
Expand All @@ -296,9 +296,9 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return ([], [], None)
else:
return (_make([default_instance_type]), [])
return (_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -312,8 +312,8 @@ def _make(instance_list):
zone=resources.zone,
clouds='ibm')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)

@classmethod
def get_default_image(cls, region) -> str:
Expand Down
8 changes: 4 additions & 4 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,12 +318,12 @@ def make_deploy_resources_variables(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
fuzzy_candidate_list: List[str] = []
if resources.instance_type is not None:
assert resources.is_launchable(), resources
resources = resources.copy(accelerators=None)
return ([resources], fuzzy_candidate_list)
return ([resources], fuzzy_candidate_list, None)

def _make(instance_list):
resource_list = []
Expand Down Expand Up @@ -379,10 +379,10 @@ def _make(instance_list):
logger.debug(f'Instance type {chosen_instance_type} does '
'not fit in the Kubernetes cluster. '
f'Reason: {reason}')
return [], []
return [], [], reason

# No fuzzy lists for Kubernetes
return _make([chosen_instance_type]), []
return _make([chosen_instance_type]), [], None

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
12 changes: 6 additions & 6 deletions sky/clouds/lambda_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,12 @@ def make_deploy_resources_variables(

def _get_feasible_launchable_resources(
self, resources: 'resources_lib.Resources'
) -> Tuple[List['resources_lib.Resources'], List[str]]:
) -> Tuple[List['resources_lib.Resources'], List[str], Optional[str]]:
if resources.instance_type is not None:
assert resources.is_launchable(), resources
# Accelerators are part of the instance type in Lambda Cloud
resources = resources.copy(accelerators=None)
return ([resources], [])
return ([resources], [], None)

def _make(instance_list):
resource_list = []
Expand All @@ -209,9 +209,9 @@ def _make(instance_list):
memory=resources.memory,
disk_tier=resources.disk_tier)
if default_instance_type is None:
return ([], [])
return ([], [], None)
else:
return (_make([default_instance_type]), [])
return (_make([default_instance_type]), [], None)

assert len(accelerators) == 1, resources
acc, acc_count = list(accelerators.items())[0]
Expand All @@ -226,8 +226,8 @@ def _make(instance_list):
zone=resources.zone,
clouds='lambda')
if instance_list is None:
return ([], fuzzy_candidate_list)
return (_make(instance_list), fuzzy_candidate_list)
return ([], fuzzy_candidate_list, None)
return (_make(instance_list), fuzzy_candidate_list, None)

@classmethod
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
Expand Down
Loading
Loading