From 4248f1b920902ab36a77859daa9fc79b4f4e5ded Mon Sep 17 00:00:00 2001
From: Robert Lucian Chiriac
Date: Wed, 17 Mar 2021 01:21:03 +0200
Subject: [PATCH] Docs for multi-instance type clusters (#1968)

(cherry picked from commit fce0df92a1acdffcb0d003aa7e78daf685c56a17)
---
 CONTRIBUTING.md                            | 20 ++++--
 docs/clusters/aws/install.md               | 49 +++++++------
 docs/clusters/aws/multi-instance-type.md   | 81 ++++++++++++++++++++++
 docs/clusters/aws/spot.md                  | 75 ++++++++++----------
 docs/clusters/gcp/install.md               | 39 +++++------
 docs/clusters/gcp/multi-instance-type.md   | 69 ++++++++++++++++++
 docs/summary.md                            |  2 +
 docs/workloads/realtime/troubleshooting.md |  2 +-
 8 files changed, 250 insertions(+), 87 deletions(-)
 create mode 100644 docs/clusters/aws/multi-instance-type.md
 create mode 100644 docs/clusters/gcp/multi-instance-type.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6bc669fe9b..0fbaaf26e8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -173,9 +173,12 @@ Create `dev/config/cluster-aws.yaml`. Paste the following config, and update `re
 cluster_name: cortex
 provider: aws
 region: # e.g. us-west-2
-instance_type: m5.large
-min_instances: 1
-max_instances: 5
+
+node_groups:
+  - name: worker-ng
+    instance_type: m5.large
+    min_instances: 1
+    max_instances: 5

 image_operator: .dkr.ecr..amazonaws.com/cortexlabs/operator:master
 image_manager: .dkr.ecr..amazonaws.com/cortexlabs/manager:master
@@ -210,10 +213,13 @@ project:
 zone: # e.g. us-east1-c
 cluster_name: cortex
 provider: gcp
-instance_type: n1-standard-2
-min_instances: 1
-max_instances: 5
-# accelerator_type: nvidia-tesla-k80 # optional
+
+node_pools:
+  - name: worker-np
+    instance_type: n1-standard-2
+    min_instances: 1
+    max_instances: 5
+    # accelerator_type: nvidia-tesla-k80 # optional

 image_operator: /cortexlabs/operator:master
 image_manager: gcr.io//cortexlabs/manager:master
diff --git a/docs/clusters/aws/install.md b/docs/clusters/aws/install.md
index a59811dd8a..9c36a0c6a5 100644
--- a/docs/clusters/aws/install.md
+++ b/docs/clusters/aws/install.md
@@ -31,23 +31,35 @@ region: us-east-1
 # list of availability zones for your region
 availability_zones: # default: 3 random availability zones in your region, e.g. [us-east-1a, us-east-1b, us-east-1c]

-# instance type
-instance_type: m5.large
-
-# minimum number of instances
-min_instances: 1
-
-# maximum number of instances
-max_instances: 5
-
-# disk storage size per instance (GB)
-instance_volume_size: 50
-
-# instance volume type [gp2 | io1 | st1 | sc1]
-instance_volume_type: gp2
-
-# instance volume iops (only applicable to io1)
-# instance_volume_iops: 3000
+# list of cluster node groups; the lower the index, the higher the priority of the node group
+node_groups:
+  - name: ng-cpu # name of the node group
+    instance_type: m5.large # instance type
+    min_instances: 1 # minimum number of instances
+    max_instances: 5 # maximum number of instances
+    instance_volume_size: 50 # disk storage size per instance (GB)
+    instance_volume_type: gp2 # instance volume type [gp2 | io1 | st1 | sc1]
+    # instance_volume_iops: 3000 # instance volume iops (only applicable to io1)
+    spot: false # enable spot instances
+
+  - name: ng-gpu
+    instance_type: g4dn.xlarge
+    min_instances: 1
+    max_instances: 5
+    instance_volume_size: 50
+    instance_volume_type: gp2
+    # instance_volume_iops: 3000
+    spot: false
+
+  - name: ng-inferentia
+    instance_type: inf1.xlarge
+    min_instances: 1
+    max_instances: 5
+    instance_volume_size: 50
+    instance_volume_type: gp2
+    # instance_volume_iops: 3000
+    spot: false
+  ...
 # subnet visibility [public (instances will have public IPs) | private (instances will not have public IPs)]
 subnet_visibility: public
@@ -75,9 +87,6 @@ operator_load_balancer_scheme: internet-facing
 # additional tags to assign to AWS resources (all resources will automatically be tagged with cortex.dev/cluster-name: )
 tags: # : map of key/value pairs

-# enable spot instances
-spot: false
-
 # SSL certificate ARN (only necessary when using a custom domain)
 ssl_certificate_arn:
diff --git a/docs/clusters/aws/multi-instance-type.md b/docs/clusters/aws/multi-instance-type.md
new file mode 100644
index 0000000000..43c56c5b4f
--- /dev/null
+++ b/docs/clusters/aws/multi-instance-type.md
@@ -0,0 +1,81 @@
+# Multi-instance type clusters
+
+The cluster can be configured to provision different instance types depending on what resources the APIs request. A multi-instance type cluster has the following advantages over a single-instance type cluster:
+
+* **Lower costs**: Overall compute costs are reduced by running each workload on the most economical instance type.
+* **Simpler logistics**: Managing multiple clusters on your own is no longer required.
+* **Multi-purpose cluster**: The cluster can take on any range of workloads: one cluster for everything. Just add the appropriate node groups to the cluster config, and you're set.
+
+## Best practices
+
+When specifying the node groups in your `cluster.yaml` config, keep in mind that node groups with lower indexes have higher priority than those with higher indexes. With that in mind, the best practices are:
+
+1. Node groups with smaller instances should have higher priority.
+1. Node groups with CPU-only instances should come before node groups equipped with GPU/Inferentia instances.
+1. Spot node groups should always come before on-demand node groups.
+
+## Example node groups
+
+### CPU spot/on-demand with GPU on-demand
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu-spot
+    instance_type: m5.large
+    spot: true
+  - name: cpu
+    instance_type: m5.large
+  - name: gpu
+    instance_type: g4dn.xlarge
+```
+
+### CPU on-demand, GPU on-demand and Inferentia on-demand
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu
+    instance_type: m5.large
+  - name: gpu
+    instance_type: g4dn.xlarge
+  - name: inferentia
+    instance_type: inf1.xlarge
+```
+
+### 3 spot CPU node groups with 1 on-demand CPU node group
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu-0
+    instance_type: t3.medium
+    spot: true
+  - name: cpu-1
+    instance_type: m5.2xlarge
+    spot: true
+  - name: cpu-2
+    instance_type: m5.8xlarge
+    spot: true
+  - name: cpu-3
+    instance_type: m5.24xlarge
+```
+
+The above can also be achieved with the following config, which covers the three spot instance types with a single node group by listing the larger instance types in `instance_distribution`:
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: cpu-0
+    instance_type: t3.medium
+    spot: true
+    spot_config:
+      instance_distribution: [m5.2xlarge, m5.8xlarge]
+      max_price: 3.27
+  - name: cpu-1
+    instance_type: m5.24xlarge
+```
diff --git a/docs/clusters/aws/spot.md b/docs/clusters/aws/spot.md
index 71a49f7992..151b0f2ce2 100644
--- a/docs/clusters/aws/spot.md
+++ b/docs/clusters/aws/spot.md
@@ -3,62 +3,60 @@
 ```yaml
 # cluster.yaml

-# whether to use spot instances in the cluster (default: false)
-spot: false
+node_groups:
+  - name: node-group-0

-spot_config:
-  # additional instance types with identical or better specs than the primary cluster instance type (defaults to only the primary instance type)
-  instance_distribution: # [similar_instance_type_1, similar_instance_type_2]
+    # whether to use spot instances for this node group (default: false)
+    spot: false

-  # minimum number of on demand instances (default: 0)
-  on_demand_base_capacity: 0
+    spot_config:
+      # additional instance types with identical or better specs than the primary cluster instance type (defaults to only the primary instance type)
+      instance_distribution: # [similar_instance_type_1, similar_instance_type_2]

-  # percentage of on demand instances to use after the on demand base capacity has been met [0, 100] (default: 50)
-  # note: setting this to 0 may hinder cluster scale up when spot instances are not available
-  on_demand_percentage_above_base_capacity: 0
+      # minimum number of on demand instances (default: 0)
+      on_demand_base_capacity: 0

-  # max price for spot instances (default: the on-demand price of the primary instance type)
-  max_price: #
+      # percentage of on demand instances to use after the on demand base capacity has been met [0, 100] (default: 50)
+      # note: setting this to 0 may hinder cluster scale up when spot instances are not available
+      on_demand_percentage_above_base_capacity: 0

-  # number of spot instance pools across which to allocate spot instances [1, 20] (default: number of instances in instance distribution)
-  instance_pools: 3
+      # max price for spot instances (default: the on-demand price of the primary instance type)
+      max_price: #

-  # fallback to on-demand instances if spot instances were unable to be allocated (default: true)
-  on_demand_backup: true
+      # number of spot instance pools across which to allocate spot instances [1, 20] (default: number of instances in instance distribution)
+      instance_pools: 3
 ```

 Spot instances are not guaranteed to be available. The chances of getting spot instances can be improved by providing `instance_distribution`, a list of alternative instance types to the primary `instance_type` you specified. If left blank, Cortex will only include the primary instance type in the `instance_distribution`. When using `instance_distribution`, use the instance type with the fewest compute resources as your primary `instance_type`. Note that the default value for `max_price` is the on-demand price of the primary instance type, but you may wish to set this to the on-demand price of the most expensive instance type in your `instance_distribution`.

 Spot instances can be mixed with on-demand instances by configuring `on_demand_base_capacity` and `on_demand_percentage_above_base_capacity`. `on_demand_base_capacity` enforces the minimum number of nodes that will be fulfilled by on-demand instances as your cluster is scaling up.
 `on_demand_percentage_above_base_capacity` defines the percentage of instances that will be on-demand after the base capacity has been fulfilled (the rest being spot instances).

 `instance_pools` is the number of pools per availability zone to allocate your instances from. See [here](https://docs.aws.amazon.com/autoscaling/ec2/APIReference/API_InstancesDistribution.html) for more details.

-Even if multiple instances are specified in your `instance_distribution` on-demand instances are mixed, there is still a possibility of running into scale up issues when attempting to spin up spot instances. Spot instance requests may not be fulfilled for several reasons. Spot instance pricing fluctuates, therefore the `max_price` may be lower than the current spot pricing rate. Another possibility could be that the availability zones of the cluster ran out of spot instances. `on_demand_backup` can be used mitigate the impact of unfulfilled spot requests by enabling the cluster to spin up on-demand instances if spot instance requests are not fulfilled within 5 minutes.
+Even if multiple instance types are specified in your `instance_distribution` and on-demand instances are mixed in, there is still a possibility of running into scale-up issues when attempting to spin up spot instances. Spot instance requests may not be fulfilled for several reasons: spot instance pricing fluctuates, so the `max_price` may be lower than the current spot pricing rate, or the cluster's availability zones may have run out of spot instances. Adding another on-demand node group with lower priority (i.e. a higher index in the `node_groups` list) can mitigate the impact of unfulfilled spot requests by enabling the cluster to spin up on-demand instances if spot instance requests are not fulfilled within 5 minutes.

 There is a spot instance limit associated with your AWS account for each instance family in each region. You can check your current limit and request an increase [here](https://console.aws.amazon.com/servicequotas/home?#!/services/ec2/quotas) (set the region in the upper right corner to your desired region, type "spot" in the search bar, and click on the quota that matches your instance type). Note that the quota values indicate the number of vCPUs available, not the number of instances; different instance types have different numbers of vCPUs, which can be seen [here](https://aws.amazon.com/ec2/instance-types/).
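+
+To tie these options together: a spot node group that distributes across similar instance types might look like the following sketch (the alternative instance types and the `max_price` value here are illustrative assumptions, not recommendations):
+
+```yaml
+# cluster.yaml
+
+node_groups:
+  - name: node-group-0
+    instance_type: m5.large # primary instance type (the smallest in the distribution)
+    spot: true
+    spot_config:
+      # similar instance types with identical or better specs than m5.large
+      instance_distribution: [m5a.large, m5n.large]
+      max_price: 0.10 # hypothetical hourly price cap applied across the distribution
+```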

 ## Example spot configuration

-### Only spot instances with backup
+### Only spot instances

 ```yaml
-
-spot: true
-
-spot_config:
-  on_demand_base_capacity: 0
-  on_demand_percentage_above_base_capacity: 0
-  on_demand_backup: true # recommended for production clusters
+node_groups:
+  - name: node-group-1
+    spot: true
 ```

 ### 3 on-demand base capacity with 0% on-demand above base capacity

 ```yaml
-min_instances: 0
-max_instances: 5
-spot: true
-spot_config:
-  on_demand_base_capacity: 3
-  on_demand_percentage_above_base_capacity: 0
+node_groups:
+  - name: node-group-1
+    min_instances: 0
+    max_instances: 5
+    spot: true
+    spot_config:
+      on_demand_base_capacity: 3
+      on_demand_percentage_above_base_capacity: 0

 # instance 1-3: on-demand
 # instance 4-5: spot
@@ -67,13 +65,14 @@ spot_config:
 ### 0 on-demand base capacity with 50% on-demand above base capacity

 ```yaml
-min_instances: 0
-max_instances: 4
-
-spot: true
-spot_config:
-  on_demand_base_capacity: 0
-  on_demand_percentage_above_base_capacity: 50
+node_groups:
+  - name: node-group-2
+    min_instances: 0
+    max_instances: 4
+    spot: true
+    spot_config:
+      on_demand_base_capacity: 0
+      on_demand_percentage_above_base_capacity: 50

 # instance 1: on-demand
 # instance 2: spot
diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md
index b745a52394..8eeabda02a 100644
--- a/docs/clusters/gcp/install.md
+++ b/docs/clusters/gcp/install.md
@@ -31,27 +31,24 @@ project:
 # GCP zone for your cluster
 zone: us-east1-c

-# instance type
-instance_type: n1-standard-2
-
-# minimum number of instances
-min_instances: 1
-
-# maximum number of instances
-max_instances: 5
-
-# enable the use of preemptible instances
-preemptible: false
-
-# enable the use of on-demand backup instances which will be used when preemptible capacity runs out
-# default is true when preemptible instances are used
-# on_demand_backup: true
-
-# GPU to attach to your instance (optional)
-# accelerator_type: nvidia-tesla-t4
-
-# the number of GPUs to attach to each instance (optional)
-# accelerators_per_instance: 1
+# list of cluster node pools; the lower the index, the higher the priority of the node pool
+node_pools:
+  - name: np-cpu # name of the node pool
+    instance_type: n1-standard-2 # instance type
+    # accelerator_type: nvidia-tesla-t4 # GPU to attach to your instance (optional)
+    # accelerators_per_instance: 1 # the number of GPUs to attach to each instance (optional)
+    min_instances: 1 # minimum number of instances
+    max_instances: 5 # maximum number of instances
+    preemptible: false # enable the use of preemptible instances
+
+  - name: np-gpu
+    instance_type: n1-standard-2
+    accelerator_type: nvidia-tesla-t4
+    accelerators_per_instance: 1
+    min_instances: 1
+    max_instances: 5
+    preemptible: false
+  ...

 # the name of the network in which to create your cluster
 # network: default
diff --git a/docs/clusters/gcp/multi-instance-type.md b/docs/clusters/gcp/multi-instance-type.md
new file mode 100644
index 0000000000..ae36dc4301
--- /dev/null
+++ b/docs/clusters/gcp/multi-instance-type.md
@@ -0,0 +1,69 @@
+# Multi-instance type clusters
+
+The cluster can be configured to provision different instance types depending on what resources the APIs request. A multi-instance type cluster has the following advantages over a single-instance type cluster:
+
+* **Lower costs**: Overall compute costs are reduced by running each workload on the most economical instance type.
+* **Simpler logistics**: Managing multiple clusters on your own is no longer required.
+* **Multi-purpose cluster**: The cluster can take on any range of workloads: one cluster for everything. Just add the appropriate node pools to the cluster config, and you're set.
+
+## Best practices
+
+When specifying the node pools in your `cluster.yaml` config, keep in mind that node pools with lower indexes have higher priority than those with higher indexes. With that in mind, the best practices are:
+
+1. Node pools with smaller instances should have higher priority.
+1. Node pools with CPU-only instances should come before node pools equipped with GPU instances.
+1. Preemptible node pools should always come before on-demand node pools.
+
+## Example node pools
+
+### CPU preemptible/on-demand with GPU on-demand
+
+```yaml
+# cluster.yaml
+
+node_pools:
+  - name: cpu-preempt
+    instance_type: e2-standard-2
+    preemptible: true
+  - name: cpu
+    instance_type: e2-standard-2
+  - name: gpu
+    instance_type: n1-standard-2
+    accelerator_type: nvidia-tesla-t4
+```
+
+### CPU on-demand with 2 GPU on-demand node pools
+
+```yaml
+# cluster.yaml
+
+node_pools:
+  - name: cpu
+    instance_type: e2-standard-2
+  - name: gpu-small
+    instance_type: n1-standard-2
+    accelerator_type: nvidia-tesla-t4
+  - name: gpu-large
+    instance_type: n1-standard-2
+    accelerator_type: nvidia-tesla-t4
+    accelerators_per_instance: 4
+```
+
+### 3 preemptible CPU node pools with 1 on-demand CPU node pool
+
+```yaml
+# cluster.yaml
+
+node_pools:
+  - name: cpu-0
+    instance_type: e2-standard-2
+    preemptible: true
+  - name: cpu-1
+    instance_type: e2-standard-4
+    preemptible: true
+  - name: cpu-2
+    instance_type: e2-standard-8
+    preemptible: true
+  - name: cpu-3
+    instance_type: e2-standard-32
+```
diff --git a/docs/summary.md b/docs/summary.md
index 600dd81757..dba1976543 100644
--- a/docs/summary.md
+++ b/docs/summary.md
@@ -58,6 +58,7 @@
   * [Update](clusters/aws/update.md)
   * [Auth](clusters/aws/auth.md)
   * [Security](clusters/aws/security.md)
+  * [Multi-instance type](clusters/aws/multi-instance-type.md)
   * [Spot instances](clusters/aws/spot.md)
   * [Networking](clusters/aws/networking/index.md)
     * [Custom domain](clusters/aws/networking/custom-domain.md)
@@ -68,6 +69,7 @@
 * GCP
   * [Install](clusters/gcp/install.md)
   * [Credentials](clusters/gcp/credentials.md)
+  * [Multi-instance type](clusters/gcp/multi-instance-type.md)
   * [Setting up kubectl](clusters/gcp/kubectl.md)
   * [Uninstall](clusters/gcp/uninstall.md)
 * [Private Docker registry](clusters/registry.md)
diff --git a/docs/workloads/realtime/troubleshooting.md b/docs/workloads/realtime/troubleshooting.md
index 7ba71efccf..527122f433 100644
--- a/docs/workloads/realtime/troubleshooting.md
+++ b/docs/workloads/realtime/troubleshooting.md
@@ -48,7 +48,7 @@ On the old UI:

 The most common reason AWS is unable to provision instances is that you have reached your instance limit. There is an instance limit associated with your AWS account for each instance family in each region, for on-demand and for spot instances. You can check your current limit and request an increase [here](https://console.aws.amazon.com/servicequotas/home?#!/services/ec2/quotas) (set the region in the upper right corner to your desired region, type "on-demand" or "spot" in the search bar, and click on the quota that matches your instance type). Note that the quota values indicate the number of vCPUs available, not the number of instances; different instance types have different numbers of vCPUs, which can be seen [here](https://aws.amazon.com/ec2/instance-types).

-If you are using spot instances and don't have `on_demand_backup` set to true, it is also possible that AWS has run out of spot instances for your requested instance type and region. You can enable `on_demand_backup` to allow Cortex to fall back to on-demand instances when spot instances are unavailable, or you can try adding additional alternative instance types in `instance_distribution`.
+If you're using spot instances in your node group, it is also possible that AWS has run out of spot instances for your requested instance type and region. To address this, you can add alternative instance types to `instance_distribution`, add a lower-priority on-demand node group for the cluster to fall back to, or move the cluster to a region with higher spot availability.

 ### Disabling rolling updates