diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 508aef55a627..30b6d9507034 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -1,6 +1,6 @@ name: Build and Release -on: +on: - push - pull_request @@ -16,15 +16,6 @@ concurrency: jobs: build-linux: - strategy: - matrix: - include: - - build: '' - defines: '' - - build: 'cuda12' - defines: '' - - build: 'cuda11' - defines: '' runs-on: ubuntu-latest steps: - name: Clone @@ -40,17 +31,13 @@ jobs: sudo apt-get update sudo apt-get install build-essential ffmpeg protobuf-compiler - name: Install CUDA Dependencies - if: ${{ matrix.build == 'cuda12' || matrix.build == 'cuda11' }} run: | - if [ "${{ matrix.build }}" == "cuda12" ]; then - export CUDA_VERSION=12-3 - else - export CUDA_VERSION=11-7 - fi curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION} + env: + CUDA_VERSION: 12-3 - name: Cache grpc id: cache-grpc uses: actions/cache@v4 @@ -69,22 +56,15 @@ jobs: cd grpc && cd cmake/build && sudo make --jobs 5 --output-sync=target install - name: Build id: build - env: - BUILD_ID: "${{ matrix.build }}" run: | go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest go install google.golang.org/protobuf/cmd/protoc-gen-go@latest export PATH=$PATH:$GOPATH/bin - if [ "${{ matrix.build }}" == "cuda12" ] || [ "${{ matrix.build }}" == "cuda11" ]; then - export BUILD_TYPE=cublas - export PATH=/usr/local/cuda/bin:$PATH - make dist - else - STATIC=true make dist - fi + export PATH=/usr/local/cuda/bin:$PATH + make dist - uses: actions/upload-artifact@v4 with: - name: LocalAI-linux-${{ matrix.build }} + name: LocalAI-linux path: release/ - name: Release uses: softprops/action-gh-release@v2 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a71467749a15..4d73f21954e5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -57,7 +57,7 @@ jobs: df -h - name: Clone uses: actions/checkout@v4 - with: + with: submodules: true - name: Setup Go ${{ matrix.go-version }} uses: actions/setup-go@v5 @@ -87,6 +87,12 @@ jobs: unzip -j -d /usr/local/bin protoc.zip bin/protoc && \ rm protoc.zip + curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt-get update + sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION} + export CUDACXX=/usr/local/cuda/bin/nvcc + go install google.golang.org/protobuf/cmd/protoc-gen-go@latest go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest @@ -102,6 +108,8 @@ jobs: sudo cp -rfv sources/go-piper/piper-phonemize/pi/lib/. /usr/lib/ && \ # Pre-build stable diffusion before we install a newer version of abseil (not compatible with stablediffusion-ncn) PATH="$PATH:/root/go/bin" GO_TAGS="stablediffusion tts" GRPC_BACKENDS=backend-assets/grpc/stablediffusion make build + env: + CUDA_VERSION: 12-3 - name: Cache grpc id: cache-grpc uses: actions/cache@v4 @@ -166,7 +174,7 @@ jobs: df -h - name: Clone uses: actions/checkout@v4 - with: + with: submodules: true - name: Build images run: | @@ -192,7 +200,7 @@ jobs: steps: - name: Clone uses: actions/checkout@v4 - with: + with: submodules: true - name: Setup Go ${{ matrix.go-version }} uses: actions/setup-go@v5 diff --git a/Makefile b/Makefile index 5af6a6c4bfa1..8140377c6e33 100644 --- a/Makefile +++ b/Makefile @@ -319,7 +319,14 @@ build-minimal: build-api: BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build -dist: build +dist: + STATIC=true $(MAKE) backend-assets/grpc/llama-cpp-avx2 +ifeq ($(OS),Darwin) + $(info ${GREEN}I Skip CUDA build on MacOS${RESET}) +else + $(MAKE) backend-assets/grpc/llama-cpp-cuda +endif + $(MAKE) build mkdir -p release # if BUILD_ID is empty, then we don't append it to the binary name ifeq ($(BUILD_ID),) @@ -677,6 +684,13 @@ ifeq ($(BUILD_TYPE),metal) cp backend/cpp/llama-fallback/llama.cpp/build/bin/default.metallib backend-assets/grpc/ endif +backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc + cp -rf backend/cpp/llama backend/cpp/llama-cuda + $(MAKE) -C backend/cpp/llama-cuda purge + $(info ${GREEN}I llama-cpp build info:cuda${RESET}) + CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server + cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda + backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \ $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/ diff --git a/go.sum b/go.sum index da2cef048842..e8f0c1f224ad 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,6 @@ github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 h1:w+iIsaOQNcT7O github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8= github.com/KyleBanks/depth v1.2.1 h1:5h8fQADFrWtarTdtDudMmGsC7GPbOAu6RVB3ffsVFHc= github.com/KyleBanks/depth v1.2.1/go.mod h1:jzSb9d0L43HxTQfT+oSA1EEp2q+ne2uh6XgeJcm8brE= -github.com/M0Rf30/go-tiny-dream v0.0.0-20231128165230-772a9c0d9aaf h1:UgjXLcE9I+VaVz7uBIlzAnyZIXwiDlIiTWqCh159aUI= -github.com/M0Rf30/go-tiny-dream v0.0.0-20231128165230-772a9c0d9aaf/go.mod h1:UOf2Mb/deUri5agct5OJ4SLWjhI+kZKbsUVUeRb24I0= github.com/Masterminds/goutils v1.1.1 h1:5nUrii3FMTL5diU80unEVvNevw1nH4+ZV4DSLVJLSYI= github.com/Masterminds/goutils v1.1.1/go.mod h1:8cTjp+g8YejhMuvIA5y2vz3BpJxksy863GQaJW2MFNU= github.com/Masterminds/semver/v3 v3.2.0 h1:3MEsd0SM6jqZojhjLWWeBY+Kcjy9i6MQAeY7YgDP83g= @@ -62,8 +60,6 @@ github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKoh github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= -github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df h1:qVcBEZlvp5A1gGWNJj02xyDtbsUI2hohlQMSB1fgER4= -github.com/donomii/go-rwkv.cpp v0.0.0-20230715075832-c898cd0f62df/go.mod h1:gWy7FIWioqYmYxkaoFyBnaKApeZVrUkHhv9EV9pz4dM= github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5 h1:iFaUwBSo5Svw6L7HYpRu/0lE3e0BaElwnNO1qkNQxBY= github.com/dsnet/compress v0.0.2-0.20210315054119-f66993602bf5/go.mod h1:qssHWj60/X5sZFNxpG4HBPDHVqxNm4DfnCKgrbZOT+s= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= @@ -73,8 +69,6 @@ github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMo github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e h1:KtbU2JR3lJuXFASHG2+sVLucfMPBjWKUUKByX6C81mQ= -github.com/ggerganov/whisper.cpp/bindings/go v0.0.0-20230628193450-85ed71aaec8e/go.mod h1:QIjZ9OktHFG7p+/m3sMvrAJKKdWrr1fZIK0rM6HZlyo= github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= @@ -99,10 +93,6 @@ github.com/go-openapi/spec v0.21.0 h1:LTVzPc3p/RzRnkQqLRndbAzjY0d0BCL72A6j3CdL9Z github.com/go-openapi/spec v0.21.0/go.mod h1:78u6VdPw81XU44qEWGhtr982gJ5BWg2c0I5XwVMotYk= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= -github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1 h1:yXvc7QfGtoZ51tUW/YVjoTwAfh8HG88XU7UOrbNlz5Y= -github.com/go-skynet/go-bert.cpp v0.0.0-20230716133540-6abe312cded1/go.mod h1:fYjkCDRzC+oRLHSjQoajmYK6AmeJnmEanV27CClAcDc= -github.com/go-skynet/go-llama.cpp v0.0.0-20231009155254-aeba71ee8428 h1:WYjkXL0Nw7dN2uDBMVCWQ8xLavrIhjF/DLczuh5L9TY= -github.com/go-skynet/go-llama.cpp v0.0.0-20231009155254-aeba71ee8428/go.mod h1:iub0ugfTnflE3rcIuqV2pQSo15nEw3GLW/utm5gyERo= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= @@ -226,18 +216,12 @@ github.com/mitchellh/reflectwalk v1.0.0 h1:9D+8oIskB4VJBN5SFlmc27fSlIBZaov1Wpk/I github.com/mitchellh/reflectwalk v1.0.0/go.mod h1:mSTlrgnPZtwu0c4WaC2kGObEpuNDbx0jmZXqmk4esnw= github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 h1:rzf0wL0CHVc8CEsgyygG0Mn9CNCCPZqOPaz8RiiHYQk= github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc= -github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760 h1:OFVkSxR7CRSRSNm5dvpMRZwmSwWa8EMMnHbc84fW5tU= -github.com/mudler/go-piper v0.0.0-20230621222733-56b8a81b4760/go.mod h1:O7SwdSWMilAWhBZMK9N9Y/oBDyMMzshE3ju8Xkexwig= github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c h1:CI5uGwqBpN8N7BrSKC+nmdfw+9nPQIDyjHHlaIiitZI= github.com/mudler/go-processmanager v0.0.0-20230818213616-f204007f963c/go.mod h1:gY3wyrhkRySJtmtI/JPt4a2mKv48h/M9pEZIW+SjeC0= -github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af h1:XFq6OUqsWQam0OrEr05okXsJK/TQur3zoZTHbiZD3Ks= -github.com/mudler/go-stable-diffusion v0.0.0-20230605122230-d89260f598af/go.mod h1:8ufRkpz/S/9ahkaxzZ5i4WMgO9w4InEhuRoT7vK5Rnw= github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= github.com/muesli/termenv v0.15.2 h1:GohcuySI0QmI3wN8Ok9PtKGkgkFIk7y6Vpb5PvrY+Wo= github.com/muesli/termenv v0.15.2/go.mod h1:Epx+iuz8sNs7mNKhxzH4fWXGNpZwUaJKRS1noLXviQ8= -github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530 h1:YXMxHwHMB9jCBo2Yu5gz3mTB3T1TnZs/HmPLv15LUSA= -github.com/nomic-ai/gpt4all/gpt4all-bindings/golang v0.0.0-20231022042237-c25dc5193530/go.mod h1:4T3CHXyrt+7FQHXaxULZfPjHbD8/99WuDDJa0YVZARI= github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ= github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= diff --git a/pkg/model/initializers.go b/pkg/model/initializers.go index 4ccc21145458..115a12a0cf4c 100644 --- a/pkg/model/initializers.go +++ b/pkg/model/initializers.go @@ -11,6 +11,7 @@ import ( "time" grpc "github.com/go-skynet/LocalAI/pkg/grpc" + "github.com/go-skynet/LocalAI/pkg/xsysinfo" "github.com/phayes/freeport" "github.com/rs/zerolog/log" "golang.org/x/sys/cpu" @@ -29,10 +30,12 @@ const ( LlamaGGML = "llama-ggml" LLamaCPP = "llama-cpp" + LLamaCPPCUDA12 = "llama-cpp-cuda12" LLamaCPPAVX2 = "llama-cpp-avx2" LLamaCPPAVX = "llama-cpp-avx" LLamaCPPFallback = "llama-cpp-fallback" + LLamaCPPCUDA = "llama-cpp-cuda" Gpt4AllLlamaBackend = "gpt4all-llama" Gpt4AllMptBackend = "gpt4all-mpt" @@ -72,8 +75,7 @@ ENTRY: } } if !e.IsDir() { - //backends = append(backends, e.Name()) - if !strings.Contains(e.Name(), LLamaCPP) { + if !strings.Contains(e.Name(), LLamaCPP) || strings.Contains(e.Name(), LLamaCPPFallback) { backends[e.Name()] = []string{} } } @@ -104,7 +106,7 @@ ENTRY: // First has more priority priorityList := []string{ // First llama.cpp and llama-ggml - LLamaCPP, LlamaGGML, Gpt4All, + LLamaCPP, LlamaGGML, Gpt4All, LLamaCPPFallback, } toTheEnd := []string{ @@ -190,17 +192,33 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string } else { grpcProcess := backendPath(o.assetDir, backend) + foundCUDA := false // for llama-cpp, check CPU capabilities and load the appropriate variant if backend == LLamaCPP { - if cpu.X86.HasAVX2 { - log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2) - } else if cpu.X86.HasAVX { - log.Info().Msgf("[%s] attempting to load with AVX variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPAVX) - } else { - log.Info().Msgf("[%s] attempting to load with fallback variant", backend) - grpcProcess = backendPath(o.assetDir, LLamaCPPFallback) + gpus, err := xsysinfo.GPUs() + if err == nil { + for _, gpu := range gpus { + if strings.Contains(gpu.String(), "nvidia") { + log.Info().Msgf("[%s] attempting to load with CUDA variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPCUDA) + if _, err := os.Stat(grpcProcess); err == nil { + foundCUDA = true + } + } + } + } + + if !foundCUDA { + if cpu.X86.HasAVX2 { + log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPAVX2) + } else if cpu.X86.HasAVX { + log.Info().Msgf("[%s] attempting to load with AVX variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPAVX) + } else { + log.Info().Msgf("[%s] attempting to load with fallback variant", backend) + grpcProcess = backendPath(o.assetDir, LLamaCPPFallback) + } } }