diff --git a/numerai/terraform/aws/-inputs.tf b/numerai/terraform/aws/-inputs.tf
index 52c4624..a768013 100644
--- a/numerai/terraform/aws/-inputs.tf
+++ b/numerai/terraform/aws/-inputs.tf
@@ -27,3 +27,9 @@ variable "gateway_stage_path" {
   type        = string
   default     = "v1"
 }
+
+variable "volume_size" {
+  description = "Size of the EC2 volumes in GB"
+  type        = number
+  default     = 20
+}
\ No newline at end of file
diff --git a/numerai/terraform/aws/-main.tf b/numerai/terraform/aws/-main.tf
index 5e1652c..ff49e8f 100644
--- a/numerai/terraform/aws/-main.tf
+++ b/numerai/terraform/aws/-main.tf
@@ -23,4 +23,5 @@ module "aws" {
   nodes               = local.aws_nodes
   node_container_port = var.node_container_port
   gateway_stage_path  = var.gateway_stage_path
+  volume_size         = var.volume_size
 }
diff --git a/numerai/terraform/aws/aws/-inputs.tf b/numerai/terraform/aws/aws/-inputs.tf
index bbbbb2e..6f50661 100644
--- a/numerai/terraform/aws/aws/-inputs.tf
+++ b/numerai/terraform/aws/aws/-inputs.tf
@@ -26,3 +26,9 @@ variable "gateway_stage_path" {
   type        = string
   default     = "v1"
 }
+
+variable "volume_size" {
+  description = "Size of the EC2 volumes in GB"
+  type        = number
+  default     = 20
+}
\ No newline at end of file
diff --git a/numerai/terraform/aws/aws/cluster.tf b/numerai/terraform/aws/aws/cluster.tf
index 87cd245..f420fbc 100644
--- a/numerai/terraform/aws/aws/cluster.tf
+++ b/numerai/terraform/aws/aws/cluster.tf
@@ -84,12 +84,35 @@ resource "aws_iam_role_policy_attachment" "aws_batch_service_role" {
   policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"
 }
 
+# Launch template for the Batch compute instances; it only overrides the
+# size and type of the /dev/xvda EBS volume, driven by var.volume_size.
+resource "aws_launch_template" "node" {
+  name = local.node_prefix
+
+  block_device_mappings {
+    device_name = "/dev/xvda"
+
+    ebs {
+      volume_size = var.volume_size
+      volume_type = "gp2"
+    }
+  }
+}
+
 resource "aws_batch_compute_environment" "node" {
-  compute_environment_name = local.node_prefix
+  compute_environment_name_prefix = "${local.node_prefix}-"
 
   compute_resources {
     instance_role = aws_iam_instance_profile.batch_ecs_instance_role.arn
 
+    launch_template {
+      launch_template_id = aws_launch_template.node.id
+      # Reference the concrete version number instead of "$Latest" so that a
+      # new template version (e.g. a changed volume_size) shows up as a
+      # change on this compute environment rather than being ignored.
+      version = aws_launch_template.node.latest_version
+    }
+
     max_vcpus = 64
 
     security_group_ids = [
@@ -145,7 +168,19 @@ resource "aws_batch_job_definition" "node" {
 
   retry_strategy {
     attempts = 2
-
+    # Retry jobs whose exit reason matches one of these container errors.
+    evaluate_on_exit {
+      on_reason = "CannotInspectContainerError:*"
+      action    = "RETRY"
+    }
+    evaluate_on_exit {
+      on_reason = "CannotPullContainerError:*"
+      action    = "RETRY"
+    }
+    evaluate_on_exit {
+      action    = "RETRY"
+      on_reason = "CannotStartContainerError:*"
+    }
     evaluate_on_exit {
       action    = "RETRY"
       on_reason = "Task failed to start"