diff --git a/packages/aws-cdk-lib/aws-eks/README.md b/packages/aws-cdk-lib/aws-eks/README.md
index f5d2c00c9aca3..bbd31e8987b76 100644
--- a/packages/aws-cdk-lib/aws-eks/README.md
+++ b/packages/aws-cdk-lib/aws-eks/README.md
@@ -228,8 +228,8 @@ cluster.addNodegroupCapacity('custom-node-group', {
 });
 ```
 
-> **NOTE:** If you add instances with the inferentia (`inf1` or `inf2`) class the
-> [neuron plugin](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/dlc-then-eks-devflow.html)
+> **NOTE:** If you add instances with the Inferentia class (`inf1` or `inf2`) or Trainium class (`trn1` or `trn1n`),
+> the [neuron plugin](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/containers/dlc-then-eks-devflow.html)
 > will be automatically installed in the kubernetes cluster.
 
 #### Node Groups with IPv6 Support
diff --git a/packages/aws-cdk-lib/aws-eks/lib/cluster.ts b/packages/aws-cdk-lib/aws-eks/lib/cluster.ts
index ba3189b798f22..6ba9eec7d19ef 100644
--- a/packages/aws-cdk-lib/aws-eks/lib/cluster.ts
+++ b/packages/aws-cdk-lib/aws-eks/lib/cluster.ts
@@ -1800,7 +1800,8 @@ export class Cluster extends ClusterBase {
       spotInterruptHandler: options.spotInterruptHandler,
     });
 
-    if (nodeTypeForInstanceType(options.instanceType) === NodeType.INFERENTIA) {
+    // Inferentia and Trainium nodes both need the Neuron device plugin.
+    if ([NodeType.INFERENTIA, NodeType.TRAINIUM].includes(nodeTypeForInstanceType(options.instanceType))) {
       this.addNeuronDevicePlugin();
     }
 
@@ -1821,7 +1822,12 @@ export class Cluster extends ClusterBase {
       options?.instanceType,
       ...options?.instanceTypes ?? [],
     ].some(i => i && nodeTypeForInstanceType(i) === NodeType.INFERENTIA);
-    if (hasInferentiaInstanceType) {
+    const hasTrainiumInstanceType = [
+      options?.instanceType,
+      ...options?.instanceTypes ?? [],
+    ].some(i => i && nodeTypeForInstanceType(i) === NodeType.TRAINIUM);
+
+    if (hasInferentiaInstanceType || hasTrainiumInstanceType) {
       this.addNeuronDevicePlugin();
     }
     return new Nodegroup(this, `Nodegroup${id}`, {
@@ -2373,6 +2379,7 @@ export class EksOptimizedImage implements ec2.IMachineImage {
         'amazon-linux-2/' : 'amazon-linux-2-arm64/' : '')
       + (this.nodeType === NodeType.GPU ? 'amazon-linux-2-gpu/' : '')
       + (this.nodeType === NodeType.INFERENTIA ? 'amazon-linux-2-gpu/' : '')
+      + (this.nodeType === NodeType.TRAINIUM ? 'amazon-linux-2-gpu/' : '')
       + 'recommended/image_id';
   }
 
@@ -2410,6 +2417,11 @@ export enum NodeType {
    * Inferentia instances
    */
   INFERENTIA = 'INFERENTIA',
+
+  /**
+   * Trainium instances
+   */
+  TRAINIUM = 'TRAINIUM',
 }
 
 /**
@@ -2473,7 +2485,8 @@ export enum MachineImageType {
 function nodeTypeForInstanceType(instanceType: ec2.InstanceType) {
   return INSTANCE_TYPES.gpu.includes(instanceType.toString().substring(0, 2)) ? NodeType.GPU :
     INSTANCE_TYPES.inferentia.includes(instanceType.toString().substring(0, 4)) ? NodeType.INFERENTIA :
-      NodeType.STANDARD;
+      INSTANCE_TYPES.trainium.includes(instanceType.toString().substring(0, 4)) ? NodeType.TRAINIUM :
+        NodeType.STANDARD;
 }
 
 function cpuArchForInstanceType(instanceType: ec2.InstanceType) {
diff --git a/packages/aws-cdk-lib/aws-eks/lib/instance-types.ts b/packages/aws-cdk-lib/aws-eks/lib/instance-types.ts
index af322ba5e2abd..164e82b0f840c 100644
--- a/packages/aws-cdk-lib/aws-eks/lib/instance-types.ts
+++ b/packages/aws-cdk-lib/aws-eks/lib/instance-types.ts
@@ -4,4 +4,5 @@ export const INSTANCE_TYPES = {
   graviton: ['a1'],
   graviton2: ['c6g', 'm6g', 'r6g', 't4g'],
   graviton3: ['c7g'],
+  trainium: ['trn1', 'trn1n'],
 };
diff --git a/packages/aws-cdk-lib/aws-eks/test/cluster.test.ts b/packages/aws-cdk-lib/aws-eks/test/cluster.test.ts
index 7bc91c4a0cf84..6b7367d6fc5f6 100644
--- a/packages/aws-cdk-lib/aws-eks/test/cluster.test.ts
+++ b/packages/aws-cdk-lib/aws-eks/test/cluster.test.ts
@@ -2209,6 +2209,42 @@ describe('cluster', () => {
       Manifest: JSON.stringify([sanitized]),
     });
   });
+  test('trn1 instances are supported', () => {
+    // GIVEN
+    const { stack } = testFixtureNoVpc();
+    const cluster = new eks.Cluster(stack, 'Cluster', { defaultCapacity: 0, version: CLUSTER_VERSION, prune: false });
+
+    // WHEN
+    cluster.addAutoScalingGroupCapacity('TrainiumInstances', {
+      instanceType: new ec2.InstanceType('trn1.2xlarge'),
+      minCapacity: 1,
+    });
+    const fileContents = fs.readFileSync(path.join(__dirname, '..', 'lib', 'addons', 'neuron-device-plugin.yaml'), 'utf8');
+    const sanitized = YAML.parse(fileContents);
+
+    // THEN
+    Template.fromStack(stack).hasResourceProperties(eks.KubernetesManifest.RESOURCE_TYPE, {
+      Manifest: JSON.stringify([sanitized]),
+    });
+  });
+  test('trn1n instances are supported', () => {
+    // GIVEN
+    const { stack } = testFixtureNoVpc();
+    const cluster = new eks.Cluster(stack, 'Cluster', { defaultCapacity: 0, version: CLUSTER_VERSION, prune: false });
+
+    // WHEN
+    cluster.addAutoScalingGroupCapacity('TrainiumInstances', {
+      instanceType: new ec2.InstanceType('trn1n.32xlarge'),
+      minCapacity: 1,
+    });
+    const fileContents = fs.readFileSync(path.join(__dirname, '..', 'lib', 'addons', 'neuron-device-plugin.yaml'), 'utf8');
+    const sanitized = YAML.parse(fileContents);
+
+    // THEN
+    Template.fromStack(stack).hasResourceProperties(eks.KubernetesManifest.RESOURCE_TYPE, {
+      Manifest: JSON.stringify([sanitized]),
+    });
+  });
   test('inf1 instances are supported in addNodegroupCapacity', () => {
     // GIVEN