Skip to content

Commit

Permalink
Merge pull request #83 from numerai/ndharasz/handle-cycle-error
Browse files Browse the repository at this point in the history
fix AWS cycle error
  • Loading branch information
ndharasz committed Dec 9, 2023
2 parents 5b9d809 + c7a6270 commit 131d1bc
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 51 deletions.
65 changes: 56 additions & 9 deletions numerai/cli/destroy_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
import click
from numerapi import base_api

from numerai.cli.node import get_models
from numerai.cli.constants import *
from numerai.cli.node.destroy import destroy_node
from numerai.cli.util.files import load_or_init_nodes
from numerai.cli.util.docker import terraform
from numerai.cli.util.files import load_or_init_nodes, store_config, copy_file
from numerai.cli.util.keys import get_provider_keys, get_numerai_keys


@click.command("destroy-all", help="Destroy all nodes")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--preserve-node-config", "-p", is_flag=True)
@click.pass_context
def destroy_all(ctx, verbose):
def destroy_all(ctx, verbose, preserve_node_config):
"""
Uses Terraform to destroy the Numerai Compute clusters for both Tournament and Signals.
This will delete everything, including:
Expand All @@ -30,9 +31,55 @@ def destroy_all(ctx, verbose):
exit(0)

nodes_config = load_or_init_nodes()
models = get_models(TOURNAMENT_NUMERAI)

for _, model in models.items():
node = model["name"]
if node in nodes_config:
destroy_node(node, verbose)
if len(nodes_config) == 0:
click.secho("No nodes to destroy", fg="green")
return

try:
provider_keys = {
nodes_config[node]["provider"]: get_provider_keys(node)
for node in nodes_config
}
except (KeyError, FileNotFoundError) as e:
click.secho(
f"make sure you run `numerai setup` and " f"`numerai node config` first...",
fg="red",
)
return

click.secho("backing up nodes.json and deleting current config...")
copy_file(NODES_PATH, f"{NODES_PATH}.backup", force=True, verbose=True)
store_config(NODES_PATH, {})

try:
click.secho(f"destroying nodes...")
for provider, provider_keys in provider_keys.items():
click.secho(f"deleting cloud resources for {provider}...")
terraform(
"destroy -auto-approve",
verbose,
provider,
env_vars=provider_keys,
inputs={"node_config_file": "nodes.json"},
)

except Exception as e:
click.secho(e.__str__(), fg="red")
click.secho("restoring nodes.json...", fg="green")
store_config(NODES_PATH, nodes_config)
return

napi = base_api.Api(*get_numerai_keys())
for node, node_config in nodes_config.items():
if "model_id" in node_config and "webhook_url" in node_config:
model_id = node_config["model_id"]
webhook_url = node_config["webhook_url"]
click.echo(f"deregistering webhook {webhook_url} for model {model_id}...")
napi.set_submission_webhook(model_id, None)

click.secho("Prediction Nodes destroyed successfully", fg="green")

if preserve_node_config:
click.secho("restoring nodes.json...", fg="green")
store_config(NODES_PATH, nodes_config)
48 changes: 38 additions & 10 deletions numerai/cli/node/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
"--provider",
"-P",
type=str,
help=f"Select a cloud provider. One of {PROVIDERS}. " f"Defaults to {DEFAULT_PROVIDER}.",
help=f"Select a cloud provider. One of {PROVIDERS}. "
f"Defaults to {DEFAULT_PROVIDER}.",
)
@click.option(
"--size",
Expand All @@ -47,7 +48,8 @@
@click.option(
"--cpu",
type=str,
help=f"For AWS only, CPUs to allocate to your node" f"Defaults to 2 (run `numerai list-constants` to see options).",
help=f"For AWS only, CPUs to allocate to your node"
f"Defaults to 2 (run `numerai list-constants` to see options).",
)
@click.option(
"--memory",
Expand Down Expand Up @@ -94,7 +96,17 @@
)
@click.pass_context
def config(
ctx, verbose, provider, size, cpu, memory, path, example, cron, timeout_minutes, register_webhook
ctx,
verbose,
provider,
size,
cpu,
memory,
path,
example,
cron,
timeout_minutes,
register_webhook,
):
"""
Uses Terraform to create a full Numerai Compute cluster in your desired provider.
Expand Down Expand Up @@ -133,7 +145,11 @@ def config(
affected_providers = set(filter(None, affected_providers))

nodes_config[node].update(
{key: default for key, default in DEFAULT_SETTINGS.items() if key not in nodes_config[node]}
{
key: default
for key, default in DEFAULT_SETTINGS.items()
if key not in nodes_config[node]
}
)
# update node as needed
node_conf = nodes_config[node]
Expand All @@ -147,29 +163,39 @@ def config(
provider = node_conf["provider"]

if timeout_minutes and provider == PROVIDER_AZURE:
click.secho("Timeout settings are unavailable for Azure and this input will be ignored.", fg="yellow")
click.secho(
"Timeout settings are unavailable for Azure and this input will be ignored.",
fg="yellow",
)
elif timeout_minutes:
node_conf["timeout_minutes"] = timeout_minutes

if provider == PROVIDER_GCP and size is not None and "mem-" in size:
click.secho(
"Invalid size: mem sizes are invalid for GCP due to sizing constraints with Google Cloud Run.", fg="red"
"Invalid size: mem sizes are invalid for GCP due to sizing constraints with Google Cloud Run.",
fg="red",
)
click.secho(
"Visit https://cloud.google.com/run/docs/configuring/services/memory-limits to learn more.", fg="red"
"Visit https://cloud.google.com/run/docs/configuring/services/memory-limits to learn more.",
fg="red",
)
exit(1)

if size and (cpu or memory):
click.secho("Cannot provide size and CPU or Memory. Either use size or provide CPU and Memory.", fg="red")
click.secho(
"Cannot provide size and CPU or Memory. Either use size or provide CPU and Memory.",
fg="red",
)
exit(1)
if (cpu or memory) and node_conf["provider"] != PROVIDER_AWS:
click.secho(
"Specifying CPU and Memory is only valid for AWS nodes. (run `numerai list-constants` to see options).",
fg="red",
)
exit(1)
elif (cpu or memory) and (not (cpu or node_conf["cpu"]) or not (memory or node_conf["memory"])):
elif (cpu or memory) and (
not (cpu or node_conf["cpu"]) or not (memory or node_conf["memory"])
):
click.secho(
"One of CPU and Memory is missing either from your options or from your node configuration."
"Provide both CPU and Memory to configure node size, or use size."
Expand Down Expand Up @@ -218,7 +244,9 @@ def config(

# Azure only: a master Azure Container Registry must be created and a dummy placeholder image pushed to it before deploying the rest of the resources
if provider == "azure":
provider_registry_conf = create_azure_registry(provider, provider_keys, verbose=verbose)
provider_registry_conf = create_azure_registry(
provider, provider_keys, verbose=verbose
)
node_conf.update(provider_registry_conf)
node_conf["docker_repo"] = f'{node_conf["acr_login_server"]}/{node}'
docker.login(node_conf, verbose)
Expand Down
51 changes: 25 additions & 26 deletions numerai/cli/node/destroy.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,27 @@
from numerai.cli.util.keys import get_provider_keys, get_numerai_keys


def destroy_node(node, verbose, preserve_node_config=False):
@click.command()
@click.option("--preserve-node-config", "-p", is_flag=True)
@click.option("--verbose", "-v", is_flag=True)
@click.pass_context
def destroy(ctx, preserve_node_config, verbose):
"""
Uses Terraform to destroy a Numerai Compute cluster.
This will delete everything, including:
- lambda url
- docker container and associated task
- all logs
This command is idempotent and safe to run multiple times.
"""

ctx.ensure_object(dict)
model = ctx.obj["model"]
node = model["name"]
if not os.path.exists(CONFIG_PATH):
click.secho(".numerai directory not setup, run `numerai setup`...", fg="red")
return

try:
nodes_config = load_or_init_nodes()
node_config = nodes_config[node]
Expand All @@ -21,6 +41,10 @@ def destroy_node(node, verbose, preserve_node_config=False):
fg="red",
)
return

if not preserve_node_config:
click.secho("backing up nodes.json...")
copy_file(NODES_PATH, f"{NODES_PATH}.backup", force=True, verbose=True)

try:
click.secho(
Expand All @@ -30,7 +54,6 @@ def destroy_node(node, verbose, preserve_node_config=False):
)
del nodes_config[node]
store_config(NODES_PATH, nodes_config)
copy_file(NODES_PATH, f"{CONFIG_PATH}/{provider}/", force=True, verbose=True)

click.secho("deleting cloud resources for node...")
terraform(
Expand Down Expand Up @@ -60,27 +83,3 @@ def destroy_node(node, verbose, preserve_node_config=False):
click.secho("re-adding node config to nodes.json...", fg="green")
nodes_config[node] = node_config
store_config(NODES_PATH, nodes_config)


@click.command()
@click.option("--preserve-node-config", "-p", is_flag=True)
@click.option("--verbose", "-v", is_flag=True)
@click.pass_context
def destroy(ctx, preserve_node_config, verbose):
"""
Uses Terraform to destroy a Numerai Compute cluster.
This will delete everything, including:
- lambda url
- docker container and associated task
- all logs
This command is idempotent and safe to run multiple times.
"""

ctx.ensure_object(dict)
model = ctx.obj["model"]
node = model["name"]
if not os.path.exists(CONFIG_PATH):
click.secho(".numerai directory not setup, run `numerai setup`...", fg="red")
return

destroy_node(node, verbose, preserve_node_config)
11 changes: 6 additions & 5 deletions numerai/cli/util/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,12 @@ def root_cause(std_out, err_msg):

if b"Cycle" in std_out:
raise exception_with_msg(
"You upgraded to 1.0+ and need to replace your AWS nodes."
"\nRun the following commands:"
"\n numerai node -m <model_name> destroy --preserve-node-config"
"\n numerai node -m <model_name> config"
"\n numerai node -m <model_name> deploy"
"You upgraded to 1.0+ and need to replace your AWS nodes before continuing!"
"\nTo do this now follow these instructions:"
'\n run "numerai destroy-all --preserve-node-config"'
'\n run "numerai node -m <model_name> config" for each node'
'\n run "numerai node -m <model_name> deploy" for each node'
"\nIf you do not want to do this, downgrade to 0.4.1 to continue."
)

raise exception_with_msg(
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

setup(
name="numerai-cli",
version="1.0.2",
version="1.0.3",
description="A library for deploying Numer.ai Prediction Nodes.",
url="https://github.com/numerai/numerai-cli",
author="Numer.ai",
Expand Down

0 comments on commit 131d1bc

Please sign in to comment.