The Go Triton Client is a robust and versatile Software Development Kit (SDK) designed to facilitate seamless integration with NVIDIA's Triton Inference Server. This SDK provides both gRPC and HTTP clients, enabling Go developers to interact with Triton for deploying and managing machine learning models with ease.
NVIDIA Triton Inference Server is an open-source inference serving software that simplifies the deployment of AI models at scale. Triton supports models from multiple frameworks (TensorFlow, PyTorch, ONNX Runtime, etc.) and provides features such as dynamic batching, concurrent model execution, and model optimization to maximize the utilization of compute resources.
With support for both HTTP and gRPC, the Go Triton Client offers a flexible, efficient way to manage models and issue inference requests directly from Go applications, taking advantage of Go's concurrency features and performance.
- NVIDIA Triton Inference Server: https://github.com/triton-inference-server/server
- Dual Protocol Support: Interact with Triton using both gRPC and HTTP protocols.
- Comprehensive Model Management: Load, unload, and query models effortlessly.
- Inference Requests: Perform synchronous and asynchronous inferences with customizable parameters.
- Health Monitoring: Check server and model readiness and liveness.
- Logging Control: Retrieve and update server logging settings.
- Extensible Design: Easily extendable to support additional Triton features.
- Go: Ensure you have Go installed (version 1.16 or higher is recommended). You can download it from https://go.dev/dl/.
- Triton Inference Server: A running instance of Triton Inference Server; see the installation guide at https://github.com/triton-inference-server/server.
You can install the Go Triton Client using go get:
go get github.com/Trendyol/go-triton-client
The SDK provides two types of clients: gRPC and HTTP. Choose the one that best fits your application's communication needs.
import (
"github.com/Trendyol/go-triton-client/client/http"
"log"
)
func createHTTPClient() {
client, err := http.NewClient(
"triton-server.example.com", // Triton HTTP endpoint (without scheme)
false, // verbose logging
3000, // connection timeout in seconds
3000, // network timeout in seconds
true, // use SSL
true, // insecure connection
nil, // custom HTTP client (optional)
nil, // logger (optional)
)
if err != nil {
log.Fatalf("Failed to create HTTP client: %v", err)
}
// Use the client...
}
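The optional custom HTTP client argument can be used to tune transport behavior such as timeouts and connection pooling. The sketch below assumes that argument accepts a standard *http.Client from net/http (aliased here to avoid clashing with the SDK's http package); the timeout and pool size are illustrative only.

import (
	"log"
	nethttp "net/http"
	"time"

	tritonhttp "github.com/Trendyol/go-triton-client/client/http"
)

func createHTTPClientWithCustomTransport() {
	// A tuned net/http client: an overall request timeout plus a larger
	// idle-connection pool per host (values are illustrative).
	customClient := &nethttp.Client{
		Timeout: 10 * time.Second,
		Transport: &nethttp.Transport{
			MaxIdleConnsPerHost: 32,
		},
	}
	client, err := tritonhttp.NewClient(
		"triton-server.example.com",
		false,
		3000,
		3000,
		true,
		true,
		customClient, // assumption: the "custom HTTP client (optional)" slot takes a *net/http.Client
		nil,
	)
	if err != nil {
		log.Fatalf("Failed to create HTTP client: %v", err)
	}
	_ = client // use the client...
}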
import (
"github.com/Trendyol/go-triton-client/client/grpc"
"log"
)
func createGRPCClient() {
client, err := grpc.NewClient(
"triton-server.example.com:8001", // Triton gRPC endpoint
false, // verbose logging
3, // connection timeout in seconds
3, // network timeout in seconds
true, // use SSL
true, // insecure connection
nil, // existing gRPC connection (optional)
nil, // logger (optional)
)
if err != nil {
log.Fatalf("Failed to create gRPC client: %v", err)
}
// Use the client...
}
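Similarly, if you already manage a gRPC connection (for example, to share one across several clients), the optional connection argument lets you hand it in. This is a sketch under the assumption that the slot accepts a *grpc.ClientConn from google.golang.org/grpc; plaintext credentials are used here purely for brevity.

import (
	"log"

	triton "github.com/Trendyol/go-triton-client/client/grpc"
	googlegrpc "google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

func createGRPCClientWithExistingConn() {
	// Create the connection yourself (grpc.NewClient requires grpc-go v1.63+).
	conn, err := googlegrpc.NewClient(
		"triton-server.example.com:8001",
		googlegrpc.WithTransportCredentials(insecure.NewCredentials()),
	)
	if err != nil {
		log.Fatalf("Failed to dial Triton: %v", err)
	}
	client, err := triton.NewClient(
		"triton-server.example.com:8001",
		false,
		3,
		3,
		false, // TLS is governed by the connection we pass in
		false,
		conn, // assumption: the "existing gRPC connection (optional)" slot takes a *grpc.ClientConn
		nil,
	)
	if err != nil {
		log.Fatalf("Failed to create gRPC client: %v", err)
	}
	_ = client // use the client...
}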
Before performing any operations, it's good practice to check the server's health.
import (
"context"
"fmt"
"log"
"github.com/Trendyol/go-triton-client/base"
"github.com/Trendyol/go-triton-client/options"
)
func checkServerHealth(tritonClient base.Client) {
isLive, err := tritonClient.IsServerLive(context.Background(), &options.IsServerLiveOptions{})
if err != nil {
log.Fatalf("Error checking server liveness: %v", err)
}
fmt.Printf("Server is live: %v\n", isLive)
isReady, err := tritonClient.IsServerReady(context.Background(), &options.IsServerReadyOptions{})
if err != nil {
log.Fatalf("Error checking server readiness: %v", err)
}
fmt.Printf("Server is ready: %v\n", isReady)
}
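Because checkServerHealth accepts the shared base.Client interface, it works with either protocol. A minimal program wiring it to the HTTP client shown earlier (endpoint and timeouts are placeholders):

package main

import (
	"log"

	"github.com/Trendyol/go-triton-client/client/http"
)

func main() {
	client, err := http.NewClient(
		"triton-server.example.com",
		false,
		3000,
		3000,
		true,
		true,
		nil,
		nil,
	)
	if err != nil {
		log.Fatalf("Failed to create HTTP client: %v", err)
	}
	// checkServerHealth is the helper defined above.
	checkServerHealth(client)
}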
Performing inference involves preparing input data, specifying desired outputs, and handling the response.
package main
import (
"context"
"fmt"
"log"
"github.com/Trendyol/go-triton-client/base"
"github.com/Trendyol/go-triton-client/client/http"
"github.com/Trendyol/go-triton-client/parser"
"github.com/Trendyol/go-triton-client/postprocess"
)
func main() {
client, err := http.NewClient(
"triton-server.example.com",
false,
200,
200,
true,
true,
nil,
nil,
)
if err != nil {
log.Fatalf("Failed to create HTTP client: %v", err)
}
// Perform inference
performInference(client)
}
func performInference(tritonClient base.Client) {
inputIds := http.NewInferInput("input_ids", "INT64", []int64{2, 3}, nil)
err := inputIds.SetData([]int{101, 202536, 102, 101, 202536, 102}, true)
if err != nil {
log.Fatal(err)
}
tokenTypeIds := http.NewInferInput("token_type_ids", "INT64", []int64{2, 3}, nil)
err = tokenTypeIds.SetData([]int{0, 0, 0, 0, 0, 0}, true)
if err != nil {
log.Fatal(err)
}
attentionMask := http.NewInferInput("attention_mask", "INT64", []int64{2, 3}, nil)
err = attentionMask.SetData([]int{1, 1, 1, 1, 1, 1}, true)
if err != nil {
log.Fatal(err)
}
outputs := []base.InferOutput{
http.NewInferOutput("logits", map[string]interface{}{"binary_data": true}),
}
response, err := tritonClient.Infer(
context.Background(),
"ty_bert",
"1",
[]base.InferInput{inputIds, tokenTypeIds, attentionMask},
outputs,
nil,
)
if err != nil {
log.Fatal(err)
}
sliceResp, err := response.AsSlice("logits")
if err != nil {
log.Fatal(err)
}
embeddings, ok := parser.ParseSlice[[][][]float64](sliceResp)
if !ok {
log.Fatal("Failed to parse inference response")
}
// Convert the embeddings to float32 and mean-pool over the token axis,
// using the attention mask so padding positions are excluded from the average.
postprocessManager := postprocess.NewPostprocessManager()
convertedEmbeddings := postprocessManager.Float64ToFloat32Slice3D(embeddings)
meanPooledEmbeddings, err := postprocessManager.MeanPoolingFloat32Slice3D(convertedEmbeddings, [][]int64{{1, 1, 1}, {1, 1, 1}})
if err != nil {
log.Fatal(err)
}
fmt.Println(meanPooledEmbeddings)
}
package main
import (
"context"
"fmt"
"log"
"github.com/Trendyol/go-triton-client/base"
"github.com/Trendyol/go-triton-client/client/grpc"
"github.com/Trendyol/go-triton-client/parser"
"github.com/Trendyol/go-triton-client/postprocess"
)
func main() {
client, err := grpc.NewClient(
"triton-server.example.com:8001",
false,
200,
200,
true,
true,
nil,
nil,
)
if err != nil {
log.Fatalf("Failed to create gRPC client: %v", err)
}
// Perform inference
performInference(client)
}
func performInference(tritonClient base.Client) {
input := grpc.NewInferInput("input_ids", "INT64", []int64{1, 3}, nil)
err := input.SetData([]int{101, 202536, 102}, true) // Example data
if err != nil {
log.Fatal(err)
}
outputs := []base.InferOutput{
grpc.NewInferOutput("output_name", map[string]interface{}{"binary_data": true}),
}
response, err := tritonClient.Infer(
context.Background(),
"model_name",
"model_version",
[]base.InferInput{input},
outputs,
nil,
)
if err != nil {
log.Fatal(err)
}
// Process response
sliceResp, err := response.AsSlice("output_name")
if err != nil {
log.Fatal(err)
}
// Parse the response
parsedData, ok := parser.ParseSlice[[][]float64](sliceResp)
if !ok {
log.Fatal("Failed to parse inference response")
}
// Post-process if needed
postprocessManager := postprocess.NewPostprocessManager()
processedData := postprocessManager.Float64ToFloat32Slice2D(parsedData)
fmt.Println(processedData)
}
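The feature list also mentions asynchronous inference. The Infer call shown above is synchronous, but Go's goroutines make it easy to fan requests out concurrently. The sketch below reuses the exact Infer call from the examples and assumes the client is safe for concurrent use; the model names, version, and output name are placeholders.

import (
	"context"
	"log"
	"sync"

	"github.com/Trendyol/go-triton-client/base"
)

// inferConcurrently runs one synchronous Infer call per model, each in its
// own goroutine, and waits for all of them to finish.
func inferConcurrently(tritonClient base.Client, models []string, inputs []base.InferInput, outputs []base.InferOutput) {
	var wg sync.WaitGroup
	for _, model := range models {
		wg.Add(1)
		go func(model string) {
			defer wg.Done()
			response, err := tritonClient.Infer(context.Background(), model, "1", inputs, outputs, nil)
			if err != nil {
				log.Printf("inference on %s failed: %v", model, err)
				return
			}
			sliceResp, err := response.AsSlice("output_name") // placeholder output name
			if err != nil {
				log.Printf("reading output of %s failed: %v", model, err)
				return
			}
			log.Printf("model %s returned: %v", model, sliceResp)
		}(model)
	}
	wg.Wait()
}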
The SDK supports various data types, including INT64, BYTES, and FP32. Ensure that the data types you send match those expected by your Triton models.
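For instance, an FP32 input can presumably be built the same way as the INT64 inputs above. The exact Go slice type SetData expects per datatype is an assumption here ([]float32 by analogy with the []int data used for INT64), and the tensor name is a placeholder:

// Hypothetical 1x4 FP32 tensor; []float32 is assumed by analogy with
// the []int data used for the INT64 inputs above.
pixelValues := http.NewInferInput("pixel_values", "FP32", []int64{1, 4}, nil)
if err := pixelValues.SetData([]float32{0.1, 0.2, 0.3, 0.4}, true); err != nil {
	log.Fatal(err)
}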
You can pass custom parameters to inference requests to control model behavior.
customParams := map[string]interface{}{
"classification": 5,
"binary_data": true,
}
response, err := tritonClient.Infer(
context.Background(),
"model_name",
"1",
inputs,
outputs,
nil, nil, nil, nil, nil, nil, nil, nil, customParams,
&options.InferOptions{},
)
The SDK provides a flexible parser to handle different output shapes and types. This parser can be used to handle complex nested slices and convert them to the desired type for further processing.
import (
"log"
"github.com/Trendyol/go-triton-client/parser"
)
// Assume `sliceResp` is the raw response slice from Triton
sliceResp, err := response.AsSlice("output_name")
if err != nil {
log.Fatal(err)
}
// Parse the response into the desired type, e.g., a 3D slice of float64
parsedData, ok := parser.ParseSlice[[][][]float64](sliceResp)
if !ok {
log.Fatal("Failed to parse inference response")
}
// Now `parsedData` holds the parsed output which you can use for further processing
This section demonstrates how to set up the Triton Inference Server with the ty_bert and ty_roberta models from HuggingFace, use the tokenizer package to encode text, perform inference, and retrieve the results.
git clone https://github.com/Trendyol/go-triton-client.git
cd go-triton-client/examples/end2end_inference
docker build -t go-triton-client-end2end-inference .
docker run -p 8000:8000 -p 8001:8001 -p 8002:8002 go-triton-client-end2end-inference
Contributions are welcome! If you find any issues or have suggestions for improvements, feel free to open an issue or submit a pull request.
Click the "Fork" button at the top right of the repository page to create a copy in your GitHub account.
git clone https://github.com/your-username/go-triton-client.git
cd go-triton-client
git checkout -b feature/your-feature-name
Implement your feature or fix the bug.
Ensure all tests pass.
go test ./...
git add .
git commit -m "Add feature: your feature description"
git push origin feature/your-feature-name