Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions .github/workflows/backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1828,6 +1828,98 @@ jobs:
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# llama-cpp-tq (TurboQuant fork)
- build-type: ''
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-cpu-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "8"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-12-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-nvidia-cuda-13-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "13"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-cuda-13-arm64-llama-cpp-tq'
base-image: "ubuntu:24.04"
runs-on: 'ubuntu-24.04-arm'
ubuntu-version: '2404'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
- build-type: 'hipblas'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64'
tag-latest: 'auto'
tag-suffix: '-gpu-rocm-hipblas-llama-cpp-tq'
runs-on: 'ubuntu-latest'
base-image: "rocm/dev-ubuntu-24.04:6.4.4"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
platforms: 'linux/arm64'
skip-drivers: 'false'
tag-latest: 'auto'
tag-suffix: '-nvidia-l4t-arm64-llama-cpp-tq'
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
runs-on: 'ubuntu-24.04-arm'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2204'
- build-type: 'vulkan'
cuda-major-version: ""
cuda-minor-version: ""
platforms: 'linux/amd64,linux/arm64'
tag-latest: 'auto'
tag-suffix: '-gpu-vulkan-llama-cpp-tq'
runs-on: 'bigger-runner'
base-image: "ubuntu:24.04"
skip-drivers: 'false'
backend: "llama-cpp-tq"
dockerfile: "./backend/Dockerfile.llama-cpp"
context: "./"
ubuntu-version: '2404'
# Stablediffusion-ggml
- build-type: ''
cuda-major-version: ""
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/bump_deps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ jobs:
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp/Makefile"
- repository: "TheTom/llama-cpp-turboquant"
variable: "LLAMA_VERSION"
branch: "master"
file: "backend/cpp/llama-cpp-tq/Makefile"
branch_suffix: "-tq"
- repository: "ggml-org/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
Expand Down Expand Up @@ -60,7 +65,7 @@ jobs:
push-to-fork: ci-forks/LocalAI
commit-message: ':arrow_up: Update ${{ matrix.repository }}'
title: 'chore: :arrow_up: Update ${{ matrix.repository }} to `${{ steps.bump.outputs.commit }}`'
branch: "update/${{ matrix.variable }}"
branch: "update/${{ matrix.variable }}${{ matrix.branch_suffix }}"
body: ${{ steps.bump.outputs.message }}
signoff: true

Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ prepare-sources
/backend/cpp/llama-cpp/llama.cpp
/backend/cpp/llama-*
!backend/cpp/llama-cpp
!backend/cpp/llama-cpp-tq
/backends
/backend-images
/result.yaml
Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -544,8 +544,9 @@ backend-images:
mkdir -p backend-images

# Backend metadata: BACKEND_NAME | DOCKERFILE_TYPE | BUILD_CONTEXT | PROGRESS_FLAG | NEEDS_BACKEND_ARG
# llama-cpp is special - uses llama-cpp Dockerfile and doesn't need BACKEND arg
# llama-cpp and forks - use llama-cpp Dockerfile
BACKEND_LLAMA_CPP = llama-cpp|llama-cpp|.|false|false
BACKEND_LLAMA_CPP_TQ = llama-cpp-tq|llama-cpp|.|false|true

# Golang backends
BACKEND_PIPER = piper|golang|.|false|true
Expand Down Expand Up @@ -609,6 +610,7 @@ endef

# Generate all docker-build targets
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP)))
$(eval $(call generate-docker-build-target,$(BACKEND_LLAMA_CPP_TQ)))
$(eval $(call generate-docker-build-target,$(BACKEND_PIPER)))
$(eval $(call generate-docker-build-target,$(BACKEND_LOCAL_STORE)))
$(eval $(call generate-docker-build-target,$(BACKEND_HUGGINGFACE)))
Expand Down
27 changes: 12 additions & 15 deletions backend/Dockerfile.llama-cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@ ARG CUDA_DOCKER_ARCH
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ARG CMAKE_ARGS
ENV CMAKE_ARGS=${CMAKE_ARGS}
ARG BACKEND=rerankers
ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}
ENV LLAMA_BACKEND_DIR=${LLAMA_BACKEND_DIR}
ARG BUILD_TYPE
ENV BUILD_TYPE=${BUILD_TYPE}
ARG CUDA_MAJOR_VERSION
Expand Down Expand Up @@ -255,32 +257,27 @@ if [[ -n "${CUDA_DOCKER_ARCH:-}" ]]; then
CUDA_ARCH_ESC="${CUDA_DOCKER_ARCH//;/\\;}"
export CMAKE_ARGS="${CMAKE_ARGS:-} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH_ESC}"
echo "CMAKE_ARGS(env) = ${CMAKE_ARGS}"
rm -rf /LocalAI/backend/cpp/llama-cpp-*-build
rm -rf /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}-*-build
fi

cd /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}

if [ "${TARGETARCH}" = "arm64" ] || [ "${BUILD_TYPE}" = "hipblas" ]; then
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make ARCH=aarch64 build-variants
else
cd /LocalAI/backend/cpp/llama-cpp
make llama-cpp-avx
make llama-cpp-avx2
make llama-cpp-avx512
make llama-cpp-fallback
make llama-cpp-grpc
make llama-cpp-rpc-server
make build-variants
fi
EOT


# Copy libraries using a script to handle architecture differences
RUN make -BC /LocalAI/backend/cpp/llama-cpp package
RUN make -BC /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR} package


FROM scratch

ARG BACKEND=llama-cpp
ARG LLAMA_BACKEND_DIR=${BACKEND}

# Copy all available binaries (the build process only creates the appropriate ones for the target architecture)
COPY --from=builder /LocalAI/backend/cpp/llama-cpp/package/. ./
COPY --from=builder /LocalAI/backend/cpp/${LLAMA_BACKEND_DIR}/package/. ./
6 changes: 6 additions & 0 deletions backend/cpp/llama-cpp-tq/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
LLAMA_VERSION?=master
LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant
BACKEND_NAME?=llama-cpp-tq
SHARED_DIR?=$(CURDIR)/../llama-cpp

include ../llama-cpp/Makefile
5 changes: 5 additions & 0 deletions backend/cpp/llama-cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ add_library(hw_grpc_proto

add_executable(${TARGET} grpc-server.cpp json.hpp httplib.h)

# Enable autoparser support if the header exists (not present in all llama.cpp forks)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/chat-auto-parser.h")
target_compile_definitions(${TARGET} PRIVATE HAS_AUTOPARSER)
endif()

target_include_directories(${TARGET} PRIVATE ../llava)
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})

Expand Down
73 changes: 44 additions & 29 deletions backend/cpp/llama-cpp/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@

LLAMA_VERSION?=a1cfb645307edc61a89e41557f290f441043d3c2
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
BACKEND_NAME?=llama-cpp
SHARED_DIR?=$(CURDIR)
GRPC_SERVER_DIR?=tools/grpc-server
SERVER_SOURCE_DIR?=tools/server

CMAKE_ARGS?=
BUILD_TYPE?=
Expand Down Expand Up @@ -67,6 +71,17 @@ ifeq ($(BUILD_TYPE),sycl_f32)
-DCMAKE_CXX_FLAGS="-fsycl"
endif

# Variants to build for each architecture (can be overridden by forks)
X86_64_VARIANTS ?= llama-cpp-avx llama-cpp-avx2 llama-cpp-avx512 llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server
ARM64_VARIANTS ?= llama-cpp-fallback llama-cpp-grpc llama-cpp-rpc-server

build-variants:
ifeq ($(ARCH),aarch64)
@for v in $(ARM64_VARIANTS); do $(MAKE) $$v || exit 1; done
else
@for v in $(X86_64_VARIANTS); do $(MAKE) $$v || exit 1; done
endif

INSTALLED_PACKAGES=$(CURDIR)/../grpc/installed_packages
INSTALLED_LIB_CMAKE=$(INSTALLED_PACKAGES)/lib/cmake
ADDED_CMAKE_ARGS=-Dabsl_DIR=${INSTALLED_LIB_CMAKE}/absl \
Expand All @@ -90,73 +105,73 @@ else
endif

llama-cpp-avx2: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build purge
$(info ${GREEN}I llama-cpp build info:avx2${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx2-build/grpc-server llama-cpp-avx2
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx2-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx2-build/grpc-server llama-cpp-avx2

llama-cpp-avx512: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build purge
$(info ${GREEN}I llama-cpp build info:avx512${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-cpp-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx512-build/grpc-server llama-cpp-avx512
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="$(BACKEND_NAME)-avx512-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx512-build/grpc-server llama-cpp-avx512

llama-cpp-avx: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build purge
$(info ${GREEN}I llama-cpp build info:avx${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-avx-build/grpc-server llama-cpp-avx
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-avx-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-avx-build/grpc-server llama-cpp-avx

llama-cpp-fallback: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build purge
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="llama-cpp-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-fallback-build/grpc-server llama-cpp-fallback
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) VARIANT="$(BACKEND_NAME)-fallback-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-fallback-build/grpc-server llama-cpp-fallback

llama-cpp-grpc: llama.cpp
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build purge
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME) $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build
$(MAKE) -C $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-cpp-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/grpc-server llama-cpp-grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="$(BACKEND_NAME)-grpc-build" build-llama-cpp-grpc-server
cp -rfv $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/grpc-server llama-cpp-grpc

llama-cpp-rpc-server: llama-cpp-grpc
cp -rf $(CURRENT_MAKEFILE_DIR)/../llama-cpp-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server
cp -rf $(CURRENT_MAKEFILE_DIR)/../$(BACKEND_NAME)-grpc-build/llama.cpp/build/bin/rpc-server llama-cpp-rpc-server

llama.cpp:
mkdir -p llama.cpp
cd llama.cpp && \
git init && \
git remote add origin $(LLAMA_REPO) && \
git fetch origin && \
git checkout -b build $(LLAMA_VERSION) && \
(git checkout -b build $(LLAMA_VERSION) || git checkout -b build origin/$(LLAMA_VERSION)) && \
git submodule update --init --recursive --depth 1 --single-branch

llama.cpp/tools/grpc-server: llama.cpp
mkdir -p llama.cpp/tools/grpc-server
bash prepare.sh
llama.cpp/$(GRPC_SERVER_DIR): llama.cpp
mkdir -p llama.cpp/$(GRPC_SERVER_DIR)
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh

rebuild:
bash prepare.sh
SHARED_DIR=$(SHARED_DIR) SERVER_SOURCE_DIR=$(SERVER_SOURCE_DIR) GRPC_SERVER_DIR=$(GRPC_SERVER_DIR) bash $(SHARED_DIR)/prepare.sh
rm -rf grpc-server
$(MAKE) grpc-server

package:
bash package.sh
bash $(SHARED_DIR)/package.sh

purge:
rm -rf llama.cpp/build
rm -rf llama.cpp/tools/grpc-server
rm -rf llama.cpp/$(GRPC_SERVER_DIR)
rm -rf grpc-server

clean: purge
rm -rf llama.cpp

grpc-server: llama.cpp llama.cpp/tools/grpc-server
grpc-server: llama.cpp llama.cpp/$(GRPC_SERVER_DIR)
@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
ifneq (,$(findstring sycl,$(BUILD_TYPE)))
+bash -c "source $(ONEAPI_VARS); \
Expand Down
4 changes: 4 additions & 0 deletions backend/cpp/llama-cpp/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "common.h"
#ifdef HAS_AUTOPARSER
#include "chat-auto-parser.h"
#endif
#include <getopt.h>
#include <grpcpp/ext/proto_server_reflection_plugin.h>
#include <grpcpp/grpcpp.h>
Expand Down Expand Up @@ -2665,6 +2667,7 @@ class BackendServiceImpl final : public backend::Backend::Service {

response->set_rendered_template(rendered_template);

#ifdef HAS_AUTOPARSER
// Run differential template analysis to detect tool format markers
if (params_base.use_jinja) {
try {
Expand Down Expand Up @@ -2770,6 +2773,7 @@ class BackendServiceImpl final : public backend::Backend::Service {
SRV_WRN("ModelMetadata: failed to run autoparser analysis: %s\n", e.what());
}
}
#endif

return grpc::Status::OK;
}
Expand Down
Loading
Loading