# Support single-container gRPC serving + HTTP proxy (#546)
## Summary

This allows us to deploy a single Docker-image service to clouds like
Fly.io and RunPod.

- Updated the entrypoint to support both gRPC and HTTP serving.
- Updated the default gRPC host to `localhost` to avoid IPv6 issues within Docker.
- Updated the docker-compose files to support single-container gRPC serving + HTTP proxy.
- Updated the default `nos.client.Client()` address to `localhost:50051` for gRPC, so examples no longer need to set it manually (see the sketch below).
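
With the new default, example code can construct a client without any address plumbing. A minimal before/after sketch (the default resolves to `localhost:50051` via `DEFAULT_GRPC_ADDRESS` in `nos/constants.py`):

```python
from nos.client import Client

# Before: every example had to spell out the gRPC address explicitly.
client = Client("[::]:50051")

# After: the no-argument form defaults to "localhost:50051",
# overridable via the NOS_GRPC_HOST / NOS_GRPC_PORT environment variables.
client = Client()
assert client.WaitForServer()
```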

## Related issues

<!-- For example: "Closes #1234" -->

## Checks

- [ ] `make lint`: I've run `make lint` to lint the changes in this PR.
- [ ] `make test`: I've made sure the tests (`make test-cpu` or `make test`) are passing.
- Additional tests:
  - [ ] Benchmark tests (when contributing new models)
  - [ ] GPU/HW tests
spillai authored Mar 8, 2024 · 1 parent 148724f · commit a83e3ae
Showing 22 changed files with 76 additions and 64 deletions.
**README.md** (10 changes: 5 additions & 5 deletions)

````diff
@@ -65,7 +65,7 @@ NOS provides an OpenAI-compatible server with streaming support so that you can
 ```python
 from nos.client import Client
 
-client = Client("[::]:50051")
+client = Client()
 
 model = client.Module("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
 response = model.chat(message="Tell me a story of 1000 words with emojis", _stream=True)
@@ -105,7 +105,7 @@ Build MidJourney discord bots in seconds.
 ```python
 from nos.client import Client
 
-client = Client("[::]:50051")
+client = Client()
 
 sdxl = client.Module("stabilityai/stable-diffusion-xl-base-1-0")
 image, = sdxl(prompts=["hippo with glasses in a library, cartoon styling"],
@@ -146,7 +146,7 @@ Build [scalable semantic search of images/videos](https://docs.nos.run/docs/demo
 ```python
 from nos.client import Client
 
-client = Client("[::]:50051")
+client = Client()
 
 clip = client.Module("openai/clip-vit-base-patch32")
 txt_vec = clip.encode_text(texts=["fox jumped over the moon"])
@@ -187,7 +187,7 @@ Perform [real-time audio transcription](./examples/tutorials/04-serving-multiple
 from pathlib import Path
 from nos.client import Client
 
-client = Client("[::]:50051")
+client = Client()
 
 model = client.Module("openai/whisper-small.en")
 with client.UploadFile(Path("audio.wav")) as remote_path:
@@ -225,7 +225,7 @@ Run classical computer-vision tasks in 2 lines of code.
 from pathlib import Path
 from nos.client import Client
 
-client = Client("[::]:50051")
+client = Client()
 
 model = client.Module("yolox/medium")
 response = model(images=[Image.open("image.jpg")])
````
**docs/OVERVIEW.md** (8 changes: 4 additions & 4 deletions)

````diff
@@ -76,7 +76,7 @@ This command pulls and starts the latest GPU docker server with all the NOS good
 ```python
 from nos.client import Client
-client = Client("[::]:50051")
+client = Client()
 sdxl = client.Module("stabilityai/stable-diffusion-xl-base-1-0")
 image, = sdxl(prompts=["fox jumped over the moon"],
@@ -119,7 +119,7 @@ curl \
 ```python
 from nos.client import Client
-client = Client("[::]:50051")
+client = Client()
 clip = client.Module("openai/clip-vit-base-patch32")
 txt_vec = clip.encode_text(texts=["fox jumped over the moon"])
@@ -159,7 +159,7 @@ curl \
 from pathlib import Path
 from nos.client import Client
-client = Client("[::]:50051")
+client = Client()
 model = client.Module("openai/whisper-large-v2")
 with client.UploadFile(Path("audio.wav")) as remote_path:
@@ -197,7 +197,7 @@ curl \
 from pathlib import Path
 from nos.client import Client
-client = Client("[::]:50051")
+client = Client()
 model = client.Module("yolox/medium")
 response = model(images=[Image.open("image.jpg")])
````
**docs/guides/running-inference.md** (2 changes: 1 addition & 1 deletion)

````diff
@@ -16,7 +16,7 @@ Let's start by importing the NOS client and creating an `Client` instance. The c
 from nos.client import Client, TaskType
 
 # Create a client that connects to the server via gRPC (over 50051)
-client = Client("[::]:50051")
+client = Client()
 
 # We provide helper functions to wait for the server to be ready
 # if the server is simultaneously spun up in a separate process.
````
**docs/guides/serving-custom-models.md** (4 changes: 2 additions & 2 deletions)

````diff
@@ -95,7 +95,7 @@ Via the `serve.yaml`, NOS automatically registers the new **WhisperX** model und
 ```python linenums="1" title="client.py"
 from nos.client import Client
-client = Client("[::]:50051")
+client = Client()
 ```
 
 !!!note
@@ -126,7 +126,7 @@ Once the model is served, we can use the client to call the `transcribe` method
 ```python linenums="1" title="client.py"
 from nos.client import Client
-client = Client("[::]:50051")
+client = Client()
 client.WaitForServer() # Wait for the server to start
 model = client.Module("mbain-whisperx")
````
**examples/docker/docker-compose.cpu.yml** (2 changes: 1 addition & 1 deletion)

````diff
@@ -1,7 +1,7 @@
 version: "3.8"
 
 services:
-  nos-server-cpu:
+  server-cpu:
     image: autonomi/nos:latest-cpu
     environment:
       - NOS_HOME=/app/.nos
````
**examples/docker/docker-compose.gpu-with-gateway.yml** (19 changes: 2 additions & 17 deletions)

````diff
@@ -1,33 +1,18 @@
 version: "3.8"
 
 services:
-  nos-http-gateway:
+  server:
     image: autonomi/nos:latest-gpu
-    command: nos-http-server --host 0.0.0.0 --port 8000 --workers 1
+    command: /app/entrypoint.sh --http
     environment:
       - NOS_HOME=/app/.nos
       - NOS_LOGGING_LEVEL=INFO
-      - NOS_GRPC_HOST=nos-server
     volumes:
       - ~/.nosd:/app/.nos
       - /dev/shm:/dev/shm
     ports:
       - 8000:8000
     ipc: host
-    depends_on:
-      - nos-server
-
-  nos-server:
-    image: autonomi/nos:latest-gpu
-    environment:
-      - NOS_HOME=/app/.nos
-      - NOS_LOGGING_LEVEL=INFO
-    volumes:
-      - ~/.nosd:/app/.nos
-      - /dev/shm:/dev/shm
-    ports:
-      - 50051:50051
-    ipc: host
     deploy:
       resources:
         reservations:
````
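
With the gateway folded into the server container, the only published port is the HTTP proxy's 8000. A rough smoke test from the host, assuming the proxy keeps serving an OpenAI-style chat-completions route (the endpoint path and payload below are illustrative assumptions, not taken from this diff):

```python
import requests

# Hypothetical check against the single-container deployment: the compose
# file above publishes only port 8000, so all traffic goes through the
# HTTP proxy. The route and JSON schema are assumptions based on the
# README's description of an OpenAI-compatible server.
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "messages": [{"role": "user", "content": "Tell me a short story."}],
    },
    timeout=60,
)
response.raise_for_status()
print(response.json())
```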
**examples/docker/docker-compose.gpu.yml** (2 changes: 1 addition & 1 deletion)

````diff
@@ -1,7 +1,7 @@
 version: "3.8"
 
 services:
-  nos-server-gpu:
+  server-gpu:
     image: autonomi/nos:latest-gpu
     environment:
       - NOS_HOME=/app/.nos
````
````diff
@@ -8,7 +8,7 @@ def test_embeddings_inf2_client(model_id):
     from nos.client import Client
 
     # Create a client
-    client = Client("[::]:50051")
+    client = Client()
     assert client.WaitForServer()
 
     # Load the embeddings model
````
**examples/inf2/sdxl/tests/test_sdxl_inf2_client.py** (2 changes: 1 addition & 1 deletion)

````diff
@@ -8,7 +8,7 @@ def test_sdxl_inf2_client(model_id):
     from nos.client import Client
 
     # Create a client
-    client = Client("[::]:50051")
+    client = Client()
     assert client.WaitForServer()
 
     # Load the embeddings model
````
````diff
@@ -10,7 +10,7 @@
 if __name__ == "__main__":
     from typing import List
 
-    client = Client("[::]:50051")
+    client = Client()
     assert client is not None
     assert client.WaitForServer()
     assert client.IsHealthy()
````
````diff
@@ -8,7 +8,7 @@
 
 
 if __name__ == "__main__":
-    client = Client("[::]:50051")
+    client = Client()
     assert client is not None
    assert client.WaitForServer()
 
````
````diff
@@ -5,12 +5,11 @@
 from nos.client import Client
 
 
-GRPC_PORT = 50051
 model_id = "tinyllama-1.1b-chat"
 
 
 if __name__ == "__main__":
-    client = Client(f"[::]:{GRPC_PORT}")
+    client = Client()
     assert client.WaitForServer()
 
     # Load the llama chat model
````
````diff
@@ -17,9 +17,8 @@
     path = Path(args.filename)
 
     # Create a client
-    address = "[::]:50051"
-    print(f"Connecting to client at {address} ...")
-    client = Client(address)
+    print("Connecting to client...")
+    client = Client()
     client.WaitForServer()
 
     # Transcribe with Whisper
````
**nos/cli/predict.py** (3 changes: 2 additions & 1 deletion)

````diff
@@ -19,6 +19,7 @@
 
 from nos.client import Client
 from nos.common.exceptions import ClientException
+from nos.constants import DEFAULT_GRPC_ADDRESS
 
 
 predict_cli = typer.Typer(name="predict", help="NOS gRPC Prediction CLI.", no_args_is_help=True)
@@ -36,7 +37,7 @@ class gRPCConfig:
 @predict_cli.callback()
 def grpc_config(
     ctx: typer.Context,
-    address: str = typer.Option("[::]:50051", "-a", "--address", help="Address of the gRPC server."),
+    address: str = typer.Option(DEFAULT_GRPC_ADDRESS, "-a", "--address", help="Address of the gRPC server."),
 ):
     """Common gRPC options"""
     client = Client(address)
````
**nos/cli/templates/docker-compose.serve.yml.j2** (1 change: 1 addition & 0 deletions)

````diff
@@ -40,6 +40,7 @@ services:
     environment:
       - NOS_HOME=/app/.nos
       - NOS_LOGGING_LEVEL={{ logging_level }}
+      - NOS_GRPC_HOST=[::]
 {%- if env_file|length > 0 %}
     env_file:
 {%- for envf in env_file %}
````
**nos/client/grpc.py** (6 changes: 3 additions & 3 deletions)

````diff
@@ -20,7 +20,7 @@
     ServerReadyException,
 )
 from nos.common.shm import NOS_SHM_ENABLED, SharedMemoryTransportManager
-from nos.constants import DEFAULT_GRPC_PORT, GRPC_MAX_MESSAGE_LENGTH, NOS_PROFILING_ENABLED
+from nos.constants import DEFAULT_GRPC_ADDRESS, GRPC_MAX_MESSAGE_LENGTH, NOS_PROFILING_ENABLED
 from nos.logging import logger
 from nos.protoc import import_module
 from nos.version import __version__
@@ -64,11 +64,11 @@ class Client:
     ```
     """
 
-    def __init__(self, address: str = f"[::]:{DEFAULT_GRPC_PORT}"):
+    def __init__(self, address: str = DEFAULT_GRPC_ADDRESS):
         """Initializes the gRPC client.
 
         Args:
-            address (str): Address for the gRPC server. Defaults to f"[::]:{DEFAULT_GRPC_PORT}".
+            address (str): Address for the gRPC server. Defaults to DEFAULT_GRPC_ADDRESS.
         """
         self.address: str = address
         self._channel: grpc.Channel = None
````
**nos/constants.py** (4 changes: 2 additions & 2 deletions)

````diff
@@ -15,11 +15,11 @@
 NOS_LOG_DIR.mkdir(parents=True, exist_ok=True)
 NOS_TMP_DIR.mkdir(parents=True, exist_ok=True)
 
-DEFAULT_HTTP_HOST = os.getenv("NOS_HTTP_HOST", "127.0.0.1")
+DEFAULT_HTTP_HOST = os.getenv("NOS_HTTP_HOST", "localhost")
 DEFAULT_HTTP_PORT = int(os.getenv("NOS_HTTP_PORT", 8000))
 DEFAULT_HTTP_ADDRESS = f"{DEFAULT_HTTP_HOST}:{DEFAULT_HTTP_PORT}"
 
-DEFAULT_GRPC_HOST = os.getenv("NOS_GRPC_HOST", "[::]")
+DEFAULT_GRPC_HOST = os.getenv("NOS_GRPC_HOST", "localhost")
 DEFAULT_GRPC_PORT = int(os.getenv("NOS_GRPC_PORT", 50051))
 DEFAULT_GRPC_ADDRESS = f"{DEFAULT_GRPC_HOST}:{DEFAULT_GRPC_PORT}"
````
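
Both hosts now resolve from the environment first and fall back to `localhost`, so a deployment can repoint the client or proxy without code changes. A small sketch of the resolution logic (mirroring the constants above; the `nos-server` hostname is just an example for a multi-container setup):

```python
import os

# Mirrors nos/constants.py: the env var wins, otherwise fall back to localhost.
host = os.getenv("NOS_GRPC_HOST", "localhost")
port = int(os.getenv("NOS_GRPC_PORT", 50051))
address = f"{host}:{port}"

print(address)
# -> "localhost:50051" by default
# -> "nos-server:50051" if run with NOS_GRPC_HOST=nos-server, as a
#    multi-container compose file might set it
```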
**nos/test/conftest.py** (18 changes: 9 additions & 9 deletions)

````diff
@@ -1,7 +1,7 @@
 import pytest
 from loguru import logger
 
-from nos.constants import DEFAULT_GRPC_PORT
+from nos.constants import DEFAULT_GRPC_HOST, DEFAULT_GRPC_PORT
 from nos.protoc import import_module
 
 
@@ -62,7 +62,7 @@ async def grpc_server(ray_executor):
         ("grpc.max_receive_message_length", GRPC_MAX_MESSAGE_LENGTH),
     ]
     server = aio.server(options=options)
-    address = f"[::]:{GRPC_TEST_PORT}"
+    address = f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT}"
     nos_service_pb2_grpc.add_InferenceServiceServicer_to_server(InferenceServiceImpl(), server)
     server.add_insecure_port(address)
 
@@ -78,23 +78,23 @@ def grpc_client():
     """Test gRPC client (Port: 50052)."""
     from nos.client import Client
 
-    yield Client(f"[::]:{GRPC_TEST_PORT}")
+    yield Client(f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT}")
 
 
 @pytest.fixture(scope="session")
 def grpc_client_cpu():
     """Test gRPC client to be used with CPU docker runtime (Port: 50053)."""
     from nos.client import Client
 
-    yield Client(f"[::]:{GRPC_TEST_PORT_CPU}")
+    yield Client(f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT_CPU}")
 
 
 @pytest.fixture(scope="session")
 def grpc_client_gpu():
     """Test gRPC client to be used with GPU docker runtime (Port: 50054)."""
     from nos.client import Client
 
-    yield Client(f"[::]:{GRPC_TEST_PORT_GPU}")
+    yield Client(f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT_GPU}")
 
 
 @pytest.fixture(scope="session")
@@ -230,7 +230,7 @@ def local_http_client_with_server(grpc_server):  # noqa: F811
     from nos.server.http._service import app_factory
 
     # Yield the HTTP client once the server is up and initialized
-    with TestClient(app_factory(address=f"[::]:{GRPC_TEST_PORT}")) as _client:
+    with TestClient(app_factory(address=f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT}")) as _client:
         yield _client
 
 
@@ -247,7 +247,7 @@ def http_client_with_cpu_backend(grpc_server_docker_runtime_cpu):  # noqa: F811
     from nos.server.http._service import app_factory
 
     # Yield the HTTP client once the server is up and initialized
-    with TestClient(app_factory(address=f"[::]:{GRPC_TEST_PORT_CPU}")) as _client:
+    with TestClient(app_factory(address=f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT_CPU}")) as _client:
         yield _client
 
 
@@ -264,7 +264,7 @@ def http_client_with_gpu_backend(grpc_server_docker_runtime_gpu):  # noqa: F811
     from nos.server.http._service import app_factory
 
     # Yield the HTTP client once the server is up and initialized
-    with TestClient(app_factory(address=f"[::]:{GRPC_TEST_PORT_GPU}")) as _client:
+    with TestClient(app_factory(address=f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT_GPU}")) as _client:
         yield _client
 
 
@@ -301,7 +301,7 @@ def http_server_with_gpu_backend(grpc_client_with_gpu_backend):  # noqa: F811
 
     def _run_uvicorn_server():
         uvicorn.run(
-            app_factory(address=f"[::]:{GRPC_TEST_PORT_GPU}", env="dev"),
+            app_factory(address=f"{DEFAULT_GRPC_HOST}:{GRPC_TEST_PORT_GPU}", env="dev"),
             host="localhost",
             port=HTTP_TEST_PORT,
             workers=1,
````
**nos/version.py** (2 changes: 1 addition & 1 deletion)

````diff
@@ -1 +1 @@
-__version__ = "0.2.0"
+__version__ = "0.2.1"
````