Skip to content

Commit

Permalink
Add 20 GB of additional storage and a retry strategy to AWS Batch
Browse files Browse the repository at this point in the history
  • Loading branch information
ndharasz committed Jun 13, 2024
1 parent fe4b718 commit bbcedfd
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 2 deletions.
6 changes: 6 additions & 0 deletions numerai/terraform/aws/-inputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,9 @@ variable "gateway_stage_path" {
type = string
default = "v1"
}

# Root-level input for the EC2 volume size (GB); it is forwarded unchanged to
# the "aws" module as `volume_size`.
variable "volume_size" {
  type        = number
  default     = 20
  description = "Size of the EC2 volumes in GB"
}
1 change: 1 addition & 0 deletions numerai/terraform/aws/-main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ module "aws" {
nodes = local.aws_nodes
node_container_port = var.node_container_port
gateway_stage_path = var.gateway_stage_path
volume_size = var.volume_size
}
6 changes: 6 additions & 0 deletions numerai/terraform/aws/aws/-inputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,9 @@ variable "gateway_stage_path" {
type = string
default = "v1"
}

# Module input for the EC2 volume size (GB); consumed by the node launch
# template's EBS block device mapping in cluster.tf.
variable "volume_size" {
  type        = number
  default     = 20
  description = "Size of the EC2 volumes in GB"
}
33 changes: 31 additions & 2 deletions numerai/terraform/aws/aws/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,30 @@ resource "aws_iam_role_policy_attachment" "aws_batch_service_role" {
policy_arn = "arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole"
}

# Launch template referenced by the Batch compute environment. Its only
# customization is the EBS volume mapped at /dev/xvda, sized by var.volume_size.
#
# NOTE(review): /dev/xvda is presumably the root device of the AMI that Batch
# selects for this compute environment - confirm. If it is, var.volume_size
# must not be smaller than that AMI's root snapshot or instances will fail to
# launch.
resource "aws_launch_template" "node" {
  name = local.node_prefix

  block_device_mappings {
    device_name = "/dev/xvda"

    ebs {
      volume_type = "gp2"
      volume_size = var.volume_size
    }
  }
}

resource "aws_batch_compute_environment" "node" {
compute_environment_name = local.node_prefix
compute_environment_name_prefix = "${local.node_prefix}-"

compute_resources {
instance_role = aws_iam_instance_profile.batch_ecs_instance_role.arn

launch_template {
launch_template_id = aws_launch_template.node.id
version = "$Latest"
}

max_vcpus = 64

security_group_ids = [
Expand Down Expand Up @@ -145,7 +163,18 @@ resource "aws_batch_job_definition" "node" {

retry_strategy {
attempts = 2

evaluate_on_exit {
on_reason = "CannotInspectContainerError:*"
action = "RETRY"
}
evaluate_on_exit {
on_reason = "CannotPullContainerError:*"
action = "RETRY"
}
evaluate_on_exit {
action = "RETRY"
on_reason = "CannotStartContainerError:*"
}
evaluate_on_exit {
action = "RETRY"
on_reason = "Task failed to start"
Expand Down

0 comments on commit bbcedfd

Please sign in to comment.