From a670318a9fb84722f1276f82395946e5f7076203 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Serta=C3=A7=20=C3=96zercan?= <852750+sozercan@users.noreply.github.com> Date: Tue, 14 May 2024 10:40:18 -0700 Subject: [PATCH] feat: auto select llama-cpp cuda runtime (#2306) * auto select cpu variant Signed-off-by: Sertac Ozercan * remove cuda target for now Signed-off-by: Sertac Ozercan * fix metal Signed-off-by: Sertac Ozercan * fix path Signed-off-by: Sertac Ozercan * cuda Signed-off-by: Sertac Ozercan * auto select cuda Signed-off-by: Sertac Ozercan * update test Signed-off-by: Sertac Ozercan * select CUDA backend only if present Signed-off-by: mudler * ci: keep cuda bin in path Signed-off-by: mudler * Makefile: make dist now builds also cuda Signed-off-by: mudler * Keep pushing fallback in case auto-flagset/nvidia fails. There could be other reasons for which the default binary may fail. For example, we might have detected an Nvidia GPU; however, the user might not have the drivers/CUDA libraries installed on the system, and so it would fail to start. 
We keep the fallback of llama.cpp at the end of the llama.cpp backends to try to fallback loading in case things go wrong Signed-off-by: mudler * Do not build cuda on MacOS Signed-off-by: mudler * cleanup Signed-off-by: Sertac Ozercan * Apply suggestions from code review Signed-off-by: Ettore Di Giacinto --------- Signed-off-by: Sertac Ozercan Signed-off-by: Ettore Di Giacinto Signed-off-by: mudler Co-authored-by: Ettore Di Giacinto Co-authored-by: mudler --- .github/workflows/release.yaml | 32 +++++--------------------- .github/workflows/test.yml | 14 +++++++++--- Makefile | 16 ++++++++++++- go.sum | 16 ------------- pkg/model/initializers.go | 42 ++++++++++++++++++++++++---------- 5 files changed, 62 insertions(+), 58 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 508aef55a627..30b6d9507034 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,6 +1,6 @@ name: Build and Release -on: +on: - push - pull_request @@ -16,15 +16,6 @@ concurrency: jobs: build-linux: - strategy: - matrix: - include: - - build: '' - defines: '' - - build: 'cuda12' - defines: '' - - build: 'cuda11' - defines: '' runs-on: ubuntu-latest steps: - name: Clone @@ -40,17 +31,13 @@ jobs: sudo apt-get update sudo apt-get install build-essential ffmpeg protobuf-compiler - name: Install CUDA Dependencies - if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }} run: | - if [ "${{ matrix.build }}" == "cuda12" ]; then - export CUDA_VERSION=12-3 - else - export CUDA_VERSION=11-7 - fi curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION} + env: + CUDA_VERSION: 12-3 - name: Cache grpc id: cache-grpc uses: actions/cache@v4 @@ -69,22 +56,15 @@ jobs: cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install - 
name: Build id: build - env: - BUILD_ID: "${{ matrix.build }}" run: | go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest go install google.golang.org/protobuf/cmd/protoc-gen-go@latest export PATH=$PATH:$GOPATH/bin - if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then - export BUILD_TYPE=cublas - export PATH=/usr/local/cuda/bin:$PATH - make dist - else - STATIC=true make dist - fi + export PATH=/usr/local/cuda/bin:$PATH + make dist - uses: actions/upload-artifact@v4 with: - name: LocalAI-linux-${{ matrix.build }} + name: LocalAI-linux path: release/ - name: Release uses: softprops/action-gh-release@v2 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a71467749a15..4d73f21954e5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -57,7 +57,7 @@ jobs: df -h - name: Clone uses: actions/checkout@v4 - with: + with: submodules: true - name: Setup Go ${{ matrix.go-version }} uses: actions/setup-go@v5 @@ -87,6 +87,12 @@ jobs: unzip -j -d /usr/local/bin protoc.zip bin/protoc && \ rm protoc.zip + curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION} + export CUDACXX=/usr/local/cuda/bin/nvcc + go install google.golang.org/protobuf/cmd/protoc-gen-go@latest go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest @@ -102,6 +108,8 @@ jobs: sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. 
/usr/lib/ && \ # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn) PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build + env: + CUDA_VERSION: 12-3 - name: Cache grpc id: cache-grpc uses: actions/cache@v4 @@ -166,7 +174,7 @@ jobs: df -h - name: Clone uses: actions/checkout@v4 - with: + with: submodules: true - name: Build images run: | @@ -192,7 +200,7 @@ jobs: steps: - name: Clone uses: actions/checkout@v4 - with: + with: submodules: true - name: Setup Go ${{ matrix.go-version }} uses: actions/setup-go@v5 diff --git a/Makefile b/Makefile index 5af6a6c4bfa1..8140377c6e33 100644 --- a/Makefile +++ b/Makefile @@ -319,7 +319,14 @@ build-minimal: build-api: BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build -dist: build +dist: + STATIC=true $(MAKE) backend-assets/grpc/llama-cpp-avx2 +ifeq ($(OS),Darwin) + $(info ${GREEN}I Skip CUDA build on MacOS${RESET}) +else + $(MAKE) backend-assets/grpc/llama-cpp-cuda +endif + $(MAKE) build mkdir -p release # if BUILD_ID is empty, then we don't append it to the binary name ifeq ($(BUILD_ID),) @@ -677,6 +684,13 @@ ifeq ($(BUILD_TYPE),metal) cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/ endif +backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-cuda + $(MAKE) -C backend/cpp/llama-cuda purge + $(info ${GREEN}I llama-cpp build info:cuda${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda + backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp 
LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/ diff --git a/go.sum b/go.sum index da2cef048842..e8f0c1f224ad 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,6 @@ github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 h1:w+iIsaOQNcT7O github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= -github.com/M0Rf30/go-tiny-dream v0.0.0-20231128165230-772a9c0d9aaf h1:UgjXLcE9I+VaVz7uBIlzAnyZIXwiDlIiTWqCh159aUI= -github.com/M0Rf30/go-tiny-dream v0.0.0-20231128165230-772a9c0d9aaf/go.mod h1:UOf2Mb/deUri5agct5OJ4SLWjhI+kZKbsUVUeRb24I0= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver/v3 v3.2.0 h1:3MEsd0SM6jqZojhjLWWeBY+Kcjy9i6MQAeY7YgDP83g= @@ -62,8 +60,6 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df h1:qVcBEZlvp5A1gGWNJj02xyDtbsUI2hohlQMSB1fgER4= -github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM= github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY= github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= 
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= @@ -73,8 +69,6 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e h1:KtbU2JR3lJuXFASHG2+sVLucfMPBjWKUUKByX6C81mQ= -github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo= github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= @@ -99,10 +93,6 @@ github.com/go-openapi/spec v0.21.0 h1:LTVzPc3p/RzRnkQqLRndbAzjY0d0BCL72A6j3CdL9Z github.com/go-openapi/spec v0.21.0/go.mod h1:78u6VdPw81XU44qEWGhtr982gJ5BWg2c0I5XwVMotYk= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= -github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1 h1:yXvc7QfGtoZ51tUW/YVjoTwAfh8HG88XU7UOrbNlz5Y= -github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1/go.mod h1:fYjkCDRzC+oRLHSjQoajmYK6AmeJnmEanV27CClAcDc= -github.com/go-skynet/go-llama.cpp v0.0.0-20231009155254-aeba71ee8428 h1:WYjkXL0Nw7dN2uDBMVCWQ8xLavrIhjF/DLczuh5L9TY= -github.com/go-skynet/go-llama.cpp v0.0.0-20231009155254-aeba71ee8428/go.mod h1:iub0ugfTnflE3rcIuqV2pQSo15nEw3GLW/utm5gyERo= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod 
h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= @@ -226,18 +216,12 @@ github.com/mitchellh/reflectwalk v1.0.0 h1:9D+8oIskB4VJBN5SFlmc27fSlIBZaov1Wpk/I github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 h1:rzf0wL0CHVc8CEsgyygG0Mn9CNCCPZqOPaz8RiiHYQk= github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc= -github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760 h1:OFVkSxR7CRSRSNm5dvpMRZwmSwWa8EMMnHbc84fW5tU= -github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760/go.mod h1:O7SwdSWMilAWhBZMK9N9Y/oBDyMMzshE3ju8Xkexwig= github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c h1:CI5uGwqBpN8N7BrSKC+nmdfw+9nPQIDyjHHlaIiitZI= github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c/go.mod h1:gY3wyrhkRySJtmtI/JPt4a2mKv48h/M9pEZIW+SjeC0= -github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af h1:XFq6OUqsWQam0OrEr05okXsJK/TQur3zoZTHbiZD3Ks= -github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af/go.mod h1:8ufRkpz/S/9ahkaxzZ5i4WMgO9w4InEhuRoT7vK5Rnw= github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= -github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530 h1:YXMxHwHMB9jCBo2Yu5gz3mTB3T1TnZs/HmPLv15LUSA= -github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI= github.com/nwaples/rardecode v1.1.0 
h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ= github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 4ccc21145458..115a12a0cf4c 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -11,6 +11,7 @@ import ( "time" grpc "github.com/go-skynet/LocalAI/pkg/grpc" + "github.com/go-skynet/LocalAI/pkg/xsysinfo" "github.com/phayes/freeport" "github.com/rs/zerolog/log" "golang.org/x/sys/cpu" @@ -29,10 +30,12 @@ const ( LlamaGGML = "llama-ggml" LLamaCPP = "llama-cpp" + LLamaCPPCUDA12 = "llama-cpp-cuda12" LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX = "llama-cpp-avx" LLamaCPPFallback = "llama-cpp-fallback" + LLamaCPPCUDA = "llama-cpp-cuda" Gpt4AllLlamaBackend = "gpt4all-llama" Gpt4AllMptBackend = "gpt4all-mpt" @@ -72,8 +75,7 @@ ENTRY: } } if !e.IsDir() { - //backends = append(backends, e.Name()) - if !strings.Contains(e.Name(), LLamaCPP) { + if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) { backends[e.Name()] = []string{} } } @@ -104,7 +106,7 @@ ENTRY: // First has more priority priorityList := []string{ // First llama.cpp and llama-ggml - LLamaCPP, LlamaGGML, Gpt4All, + LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback, } toTheEnd := []string{ @@ -190,17 +192,33 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } else { grpcProcess := backendPath(o.assetDir, backend) + foundCUDA := false // for llama-cpp, check CPU capabilities and load the appropriate variant if backend == LLamaCPP { - if cpu.X86.HasAVX2 { - log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { - log.Info().Msgf("[%s] attempting to load with AVX variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX) - } else { - 
log.Info().Msgf("[%s] attempting to load with fallback variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPFallback) + gpus, err := xsysinfo.GPUs() + if err == nil { + for _, gpu := range gpus { + if strings.Contains(gpu.String(), "nvidia") { + log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA) + if _, err := os.Stat(grpcProcess); err == nil { + foundCUDA = true + } + } + } + } + + if !foundCUDA { + if cpu.X86.HasAVX2 { + log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2) + } else if cpu.X86.HasAVX { + log.Info().Msgf("[%s] attempting to load with AVX variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPAVX) + } else { + log.Info().Msgf("[%s] attempting to load with fallback variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPFallback) + } } }