diff --git a/Makefile b/Makefile index 2aee18cc749f0..564f50afdc44e 100644 --- a/Makefile +++ b/Makefile @@ -49,17 +49,23 @@ # % MO_CL_CUDA=1 make # where am I +ifeq ($(GO),) + GO=go +endif + ROOT_DIR = $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) BIN_NAME := mo-service UNAME_S := $(shell uname -s | tr A-Z a-z) UNAME_M := $(shell uname -m) -GOPATH := $(shell go env GOPATH) -GO_VERSION=$(shell go version) +GOPATH := $(shell $(GO) env GOPATH) +GO_VERSION=$(shell $(GO) version) BRANCH_NAME=$(shell git rev-parse --abbrev-ref HEAD) LAST_COMMIT_ID=$(shell git rev-parse --short HEAD) BUILD_TIME=$(shell date +%s) MO_VERSION=$(shell git symbolic-ref -q --short HEAD || git describe --tags --exact-match) -GO_MODULE=$(shell go list -m) +GO_MODULE=$(shell $(GO) list -m) +GO_MAJOR_VERSION = $(shell $(GO) version | cut -c 14- | cut -d' ' -f1 | cut -d'.' -f1) +GO_MINOR_VERSION = $(shell $(GO) version | cut -c 14- | cut -d' ' -f1 | cut -d'.' -f2) # check the MUSL_TARGET from https://musl.cc # make MUSL_TARGET=aarch64-linux musl to cross make the aarch64 linux executable @@ -78,6 +84,7 @@ ifneq ($(GOARCH)$(TARGET_ARCH)$(GOOS)$(TARGET_OS),) $(error cross compilation has been disabled) endif + ############################################################################### # default target ############################################################################### @@ -151,8 +158,8 @@ help: .PHONY: vendor-build vendor-build: - $(info [go mod vendor]) - @go mod vendor + $(info [$(GO) mod vendor]) + @$(GO) mod vendor ############################################################################### # code generation @@ -161,7 +168,7 @@ vendor-build: .PHONY: config config: $(info [Create build config]) - @go mod tidy + @$(GO) mod tidy .PHONY: generate-pb generate-pb: @@ -178,37 +185,55 @@ pb: vendor-build generate-pb fmt VERSION_INFO :=-X '$(GO_MODULE)/pkg/version.GoVersion=$(GO_VERSION)' -X '$(GO_MODULE)/pkg/version.BranchName=$(BRANCH_NAME)' -X 
'$(GO_MODULE)/pkg/version.CommitID=$(LAST_COMMIT_ID)' -X '$(GO_MODULE)/pkg/version.BuildTime=$(BUILD_TIME)' -X '$(GO_MODULE)/pkg/version.Version=$(MO_VERSION)' THIRDPARTIES_INSTALL_DIR=$(ROOT_DIR)/thirdparties/install +CGO_DIR=$(ROOT_DIR)/cgo RACE_OPT := DEBUG_OPT := CGO_DEBUG_OPT := TAGS := +GOTAGS := +GOEXPERIMENT_OPT := + +ifeq ("$(UNAME_M)", "x86_64") + ifeq ($(shell expr $(GO_MAJOR_VERSION) \>= 1), 1) + ifeq ($(shell expr $(GO_MINOR_VERSION) \>= 26), 1) + #GOEXPERIMENT_OPT=GOEXPERIMENT=simd + endif + endif + ifneq ($(GOAMD64),) + GOEXPERIMENT_OPT+=GOAMD64=$(GOAMD64) + endif +endif ifeq ($(MO_CL_CUDA),1) ifeq ($(CONDA_PREFIX),) $(error CONDA_PREFIX env variable not found.) endif CUVS_CFLAGS := -I$(CONDA_PREFIX)/include - CUVS_LDFLAGS := -L$(CONDA_PREFIX)/envs/go/lib -lcuvs -lcuvs_c + CUVS_LDFLAGS := -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c CUDA_CFLAGS := -I/usr/local/cuda/include $(CUVS_CFLAGS) CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart $(CUVS_LDFLAGS) -lstdc++ - TAGS += -tags "gpu" + TAGS += gpu endif ifeq ($(TYPECHECK),1) - TAGS += -tags "typecheck" + TAGS += typecheck endif -CGO_OPTS :=CGO_CFLAGS="-I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)" -GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)" +CGO_OPTS :=CGO_CFLAGS="-I$(CGO_DIR) -I$(THIRDPARTIES_INSTALL_DIR)/include $(CUDA_CFLAGS)" +GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib -fopenmp' $(VERSION_INFO)" ifeq ("$(UNAME_S)","darwin") -GOLDFLAGS:=-ldflags="-extldflags '-L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)" +GOLDFLAGS:=-ldflags="-extldflags '-L$(CGO_DIR) -lmo -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,@executable_path/lib' $(VERSION_INFO)" endif ifeq ($(GOBUILD_OPT),) GOBUILD_OPT := endif +ifneq ($(TAGS),) + GOTAGS := -tags "$(TAGS)" +endif + .PHONY: 
cgo cgo: thirdparties @(cd cgo; ${MAKE} ${CGO_DEBUG_OPT}) @@ -222,7 +247,7 @@ thirdparties: .PHONY: build build: config cgo thirdparties $(info [Build binary]) - $(CGO_OPTS) go build $(TAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service + $(GOEXPERIMENT_OPT) $(CGO_OPTS) $(GO) build $(GOTAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service # https://wiki.musl-libc.org/getting-started.html # https://musl.cc/ @@ -248,17 +273,17 @@ musl-thirdparties: musl-install .PHONY: musl musl: override CGO_OPTS += CC=$(MUSL_CC) musl: override GOLDFLAGS:=-ldflags="--linkmode 'external' --extldflags '-static -L$(THIRDPARTIES_INSTALL_DIR)/lib -lstdc++ -Wl,-rpath,\$${ORIGIN}/lib' $(VERSION_INFO)" -musl: override TAGS := -tags musl +musl: override GOTAGS := -tags musl musl: musl-install musl-cgo config musl-thirdparties musl: $(info [Build binary(musl)]) - $(CGO_OPTS) go build $(TAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service + $(CGO_OPTS) $(GO) build $(GOTAGS) $(RACE_OPT) $(GOLDFLAGS) $(DEBUG_OPT) $(GOBUILD_OPT) -o $(BIN_NAME) ./cmd/mo-service # build mo-tool .PHONY: mo-tool mo-tool: config cgo thirdparties $(info [Build mo-tool tool]) - $(CGO_OPTS) go build $(GOLDFLAGS) -o mo-tool ./cmd/mo-tool + $(CGO_OPTS) $(GO) build $(GOLDFLAGS) -o mo-tool ./cmd/mo-tool # build mo-service binary for debugging with go's race detector enabled # produced executable is 10x slower and consumes much more memory @@ -1007,7 +1032,7 @@ launch-minio-debug: debug dev-up-minio-local clean: $(info [Clean up]) $(info Clean go test cache) - @go clean -testcache + @$(GO) clean -testcache rm -f $(BIN_NAME) rm -rf $(ROOT_DIR)/vendor rm -rf $(MUSL_DIR) @@ -1027,12 +1052,12 @@ fmt: .PHONY: install-static-check-tools install-static-check-tools: @curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | bash -s -- -b $(GOPATH)/bin v2.6.2 - @go install 
github.com/matrixorigin/linter/cmd/molint@latest - @go install github.com/apache/skywalking-eyes/cmd/license-eye@v0.4.0 + @$(GO) install github.com/matrixorigin/linter/cmd/molint@latest + @$(GO) install github.com/apache/skywalking-eyes/cmd/license-eye@v0.4.0 .PHONY: static-check static-check: config err-check - $(CGO_OPTS) go vet -vettool=`which molint` ./... + $(CGO_OPTS) $(GO) vet -vettool=`which molint` ./... $(CGO_OPTS) license-eye -c .licenserc.yml header check $(CGO_OPTS) license-eye -c .licenserc.yml dep check $(CGO_OPTS) golangci-lint run -v -c .golangci.yml ./... diff --git a/cgo/Makefile b/cgo/Makefile index 5678f16cf5814..d25f0400aab96 100644 --- a/cgo/Makefile +++ b/cgo/Makefile @@ -1,48 +1,77 @@ DEBUG_OPT := UNAME_M := $(shell uname -m) +UNAME_S := $(shell uname -s) +CC ?= gcc # Yeah, fast math. We want it to be fast, for all xcall, # IEEE compliance should not be an issue. OPT_LV := -O3 -ffast-math -ftree-vectorize -funroll-loops -CFLAGS=-std=c99 -g ${OPT_LV} -Wall -Werror -I../thirdparties/install/include -OBJS=mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o -CUDA_OBJS= +COMMON_CFLAGS := -g $(OPT_LV) -Wall -Werror -fPIC -I../thirdparties/install/include +CFLAGS := -std=c99 $(COMMON_CFLAGS) +OBJS := mo.o arith.o compare.o logic.o xcall.o usearchex.o bloom.o +CUDA_OBJS := +LDFLAGS := -L../thirdparties/install/lib -lusearch_c +TARGET_LIB := libmo.so + +ifeq ($(UNAME_S),Darwin) + TARGET_LIB := libmo.dylib + LDFLAGS += -dynamiclib -undefined dynamic_lookup -install_name @rpath/$(TARGET_LIB) +else + LDFLAGS += -shared +endif ifeq ($(UNAME_M), x86_64) - CFLAGS+= -march=haswell + CFLAGS += -march=haswell endif ifeq ($(MO_CL_CUDA),1) + ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) 
+ endif CC = /usr/local/cuda/bin/nvcc - CFLAGS = -ccbin g++ -m64 --shared -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 + CFLAGS = -ccbin g++ -m64 -Xcompiler -fPIC -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 CFLAGS += -I../thirdparties/install/include -DMO_CL_CUDA CUDA_OBJS += cuda/cuda.o - CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++ + # Explicitly include all needed libraries for shared library linking + CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lstdc++ + LDFLAGS += $(CUDA_LDFLAGS) endif -all: libmo.a +.PHONY: all clean test debug + +all: $(TARGET_LIB) libmo.a -libmo.a: $(OBJS) +$(TARGET_LIB): $(OBJS) ifeq ($(MO_CL_CUDA),1) - make -C cuda + $(MAKE) -C cuda + $(MAKE) -C cuvs + $(CC) $(LDFLAGS) -o $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o +else + $(CC) $(LDFLAGS) -o $@ $(OBJS) endif - ar -rcs libmo.a $(OBJS) $(CUDA_OBJS) -# -# $(CC) -o libmo.a $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS) +libmo.a: $(OBJS) +ifeq ($(MO_CL_CUDA),1) + $(MAKE) -C cuda + $(MAKE) -C cuvs + ar -rcs $@ $(OBJS) $(CUDA_OBJS) cuvs/*.o +else + ar -rcs $@ $(OBJS) +endif +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ -test: libmo.a - make -C test +test: $(TARGET_LIB) + $(MAKE) -C test -.PHONY: debug debug: override OPT_LV := -O0 debug: override DEBUG_OPT := debug debug: all -.PHONY: clean clean: - rm -f *.o *.a *.so + rm -f *.o *.a *.so *.dylib ifeq ($(MO_CL_CUDA),1) - make -C cuda clean + $(MAKE) -C cuda clean + $(MAKE) -C cuvs clean endif diff --git a/cgo/README.md b/cgo/README.md index 5699ca4d292a2..ffb190c652bc3 100644 --- 
a/cgo/README.md +++ b/cgo/README.md @@ -1,25 +1,28 @@ MatrixOne CGO Kernel =============================== -This directory contains cgo source code for MO. Running -make should produce two files to be used by go code. -On go side, go will `include "mo.h"` and `-lmo`. +This directory contains CGO source code for MatrixOne. Running `make` produces the core library files used by Go code. + +On the Go side, the integration typically uses `mo.h` and links against the generated libraries: ``` mo.h -libmo.a +libmo.a / libmo.so ``` -`mo.h` should be pristine, meaning it only contains C function -prototype used by go. The only datatypes that can be passed -between go and c code are int and float/double and pointer. -Always explicitly specify int size such as `int32_t`, `uint64_t`. -Do not use `int`, `long`, etc. +`mo.h` should remain pristine, containing only C function prototypes for Go to consume. Data passed between Go and C should be limited to standard types (int, float, double, pointers). Always specify explicit integer sizes (e.g., `int32_t`, `uint64_t`) and avoid platform-dependent types like `int` or `long`. + +GPU Support (CUDA & cuVS) +------------------------- +The kernel supports GPU acceleration for certain operations (e.g., vector search) via NVIDIA CUDA and the cuVS library. + +- **Build Flag:** GPU support is enabled by setting `MO_CL_CUDA=1` during the build. +- **Environment:** Requires a working CUDA installation and a Conda environment with `cuvs` and `rmm` installed. +- **Source Code:** GPU-specific code resides in the `cuda/` and `cuvs/` subdirectories. Implementation Notes --------------------------------- +-------------------- -1. Pure C. -2. Use memory passed from go. Try not allocate memory in C code. -3. Only depends on libc and libm. -4. If 3rd party lib is absolutely necessary, import source code - and build from source. If 3rd party lib is C++, wrap it completely in C. +1. **Language:** Core kernel is Pure C. 
GPU extensions use C++ and CUDA, wrapped in a C-compatible interface. +2. **Memory Management:** Prefer using memory allocated and passed from Go. Minimize internal allocations in C/C++ code. +3. **Dependencies:** The base kernel depends only on `libc`, `libm`, and `libusearch`. GPU builds introduce dependencies on CUDA, `cuvs`, and `rmm`. +4. **Third-party Libraries:** If a third-party library is necessary, it should be built from source (see `thirdparties/` directory). C++ libraries must be fully wrapped in C before being exposed to Go. diff --git a/cgo/cuda/Makefile b/cgo/cuda/Makefile index a95913b014d58..eca30f9be2b98 100644 --- a/cgo/cuda/Makefile +++ b/cgo/cuda/Makefile @@ -395,7 +395,7 @@ $(FATBIN_FILE): mocl.cu $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -fatbin $< cuda.o: cuda.cpp - $(EXEC) $(NVCC) $(INCLUDES) -O3 --shared $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< + $(EXEC) $(NVCC) $(INCLUDES) -O3 --shared -Xcompiler -fPIC $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< mytest.o: cuda.cpp $(FATBIN_FILE) $(EXEC) $(NVCC) $(INCLUDES) -DTEST_RUN -g -O0 $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< diff --git a/cgo/cuvs/Makefile b/cgo/cuvs/Makefile new file mode 100644 index 0000000000000..99341f65f3029 --- /dev/null +++ b/cgo/cuvs/Makefile @@ -0,0 +1,71 @@ +# Makefile for MatrixOne cuVS C Wrapper + +UNAME_M := $(shell uname -m) +CUDA_PATH ?= /usr/local/cuda +NVCC := $(CUDA_PATH)/bin/nvcc + +ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) +endif + +# Compilation flags +# Added --extended-lambda because raft/core/copy.cuh requires it for some internal headers +NVCC_FLAGS := -std=c++17 -x cu -Xcompiler "-Wall -Wextra -fPIC -O2" --extended-lambda --expt-relaxed-constexpr +NVCC_FLAGS += -I. 
-I$(CUDA_PATH)/include -I$(CONDA_PREFIX)/include -I$(CONDA_PREFIX)/include/rapids -I$(CONDA_PREFIX)/include/raft -I$(CONDA_PREFIX)/include/cuvs +NVCC_FLAGS += -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DRAFT_SYSTEM_LITTLE_ENDIAN=1 + +# Linking flags +LDFLAGS := -shared +LDFLAGS += -L$(CUDA_PATH)/lib64/stubs -lcuda -L$(CUDA_PATH)/lib64 -lcudart +LDFLAGS += -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lrapids_logger +LDFLAGS += -Xlinker -lpthread -Xlinker -lm + +# Target library +TARGET := libmocuvs.so + +# Source files +SRCS := brute_force_c.cpp ivf_flat_c.cpp cagra_c.cpp kmeans_c.cpp helper.cpp +OBJS := $(SRCS:.cpp=.o) + +# Test configuration +TESTDIR := test +OBJDIR := obj +TEST_EXE := test_cuvs_worker +TEST_SRCS := $(TESTDIR)/main_test.cu \ + $(TESTDIR)/brute_force_test.cu \ + $(TESTDIR)/ivf_flat_test.cu \ + $(TESTDIR)/cagra_test.cu \ + $(TESTDIR)/kmeans_test.cu + +TEST_OBJS := $(patsubst $(TESTDIR)/%.cu, $(OBJDIR)/test/%.o, $(TEST_SRCS)) + +.PHONY: all clean test + +all: $(OBJS) + +$(TARGET): $(OBJS) + @echo "Linking shared library $@" + $(NVCC) $(LDFLAGS) $^ -o $@ + +%.o: %.cpp + @echo "Compiling $< with NVCC" + $(NVCC) $(NVCC_FLAGS) -c $< -o $@ + +# Test targets +test: $(TEST_EXE) + @echo "Running tests..." + ./$(TEST_EXE) + +$(TEST_EXE): $(TEST_OBJS) + @echo "NVCCLD $@" + $(NVCC) $(subst -x cu,,$(NVCC_FLAGS)) $^ $(subst -shared,,$(LDFLAGS)) -o $@ + +$(OBJDIR)/test/%.o: $(TESTDIR)/%.cu + @mkdir -p $(@D) + @echo "NVCC $<" + $(NVCC) -std=c++17 -Xcompiler "-Wall -Wextra -fPIC -O2" --extended-lambda --expt-relaxed-constexpr -I. -I$(CUDA_PATH)/include -I$(CONDA_PREFIX)/include -I$(CONDA_PREFIX)/include/rapids -I$(CONDA_PREFIX)/include/raft -I$(CONDA_PREFIX)/include/cuvs -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DRAFT_SYSTEM_LITTLE_ENDIAN=1 -c $< -o $@ + +clean: + @echo "Cleaning up..." 
+ rm -f $(TARGET) *.o $(TEST_EXE) + rm -rf $(OBJDIR) diff --git a/cgo/cuvs/brute_force.hpp b/cgo/cuvs/brute_force.hpp new file mode 100644 index 0000000000000..58fd5fb2cc3d5 --- /dev/null +++ b/cgo/cuvs/brute_force.hpp @@ -0,0 +1,245 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include // For std::copy +#include // For simulation debug logs +#include +#include // For std::iota +#include // For std::runtime_error +#include +#include +#include +#include // For std::promise and std::future +#include // For std::numeric_limits +#include // For std::shared_mutex + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include // For raft::device_matrix +#include // Required for device_matrix_view +#include // For raft::host_matrix +#include // Core resource handle +#include // RESTORED: map.cuh +#include // For raft::copy with type conversion + + +// cuVS includes +#include // cuVS distance API +#include // Correct include +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief Brute-force nearest neighbor search on GPU. 
+ * @tparam T Data type of the vector elements (e.g., float, half). + */ +template +class gpu_brute_force_t { +public: + std::vector flattened_host_dataset; // Host-side copy of the dataset + std::unique_ptr> index; // cuVS brute-force index + cuvs::distance::DistanceType metric; // Distance metric + uint32_t dimension; // Dimension of vectors + uint32_t count; // Number of vectors in the dataset + int device_id_; // CUDA device ID + std::unique_ptr worker; // Asynchronous task worker + std::shared_mutex mutex_; // Protects index and data access + bool is_loaded_ = false; // Whether the index is loaded into GPU memory + std::shared_ptr dataset_device_ptr_; // Pointer to device-side dataset memory + + ~gpu_brute_force_t() { + destroy(); + } + + /** + * @brief Constructor for brute-force search. + * @param dataset_data Pointer to the flattened dataset on host. + * @param count_vectors Number of vectors. + * @param dimension Vector dimension. + * @param m Distance metric. + * @param nthread Number of worker threads. + * @param device_id GPU device ID. + */ + gpu_brute_force_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, cuvs::distance::DistanceType m, + uint32_t nthread, int device_id = 0) + : dimension(dimension), count(static_cast(count_vectors)), metric(m), device_id_(device_id) { + worker = std::make_unique(nthread, device_id_); + + // Resize flattened_host_dataset and copy data from the flattened array + flattened_host_dataset.resize(count * dimension); // Total elements + if (dataset_data) { + std::copy(dataset_data, dataset_data + (count * dimension), flattened_host_dataset.begin()); + } + } + + /** + * @brief Loads the dataset to the GPU and builds the index. 
+ */ + void load() { + std::unique_lock lock(mutex_); // Acquire exclusive lock + if (is_loaded_) return; + + std::promise init_complete_promise; + std::future init_complete_future = init_complete_promise.get_future(); + + auto init_fn = [&](raft_handle_wrapper_t& handle) -> std::any { + if (flattened_host_dataset.empty()) { // Use new member + index = nullptr; // Ensure index is null if no data + init_complete_promise.set_value(true); // Signal completion even if empty + return std::any(); + } + + auto dataset_device = new auto(raft::make_device_matrix( + *handle.get_raft_resources(), static_cast(count), static_cast(dimension))); + + dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), flattened_host_dataset.data(), + flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*handle.get_raft_resources()))); + + cuvs::neighbors::brute_force::index_params index_params; // Correct brute_force namespace + index_params.metric = metric; + + index = std::make_unique>( + cuvs::neighbors::brute_force::build(*handle.get_raft_resources(), index_params, raft::make_const_mdspan(dataset_device->view()))); // Use raft::make_const_mdspan + + raft::resource::sync_stream(*handle.get_raft_resources()); // Synchronize after build + + init_complete_promise.set_value(true); // Signal that initialization is complete + return std::any(); + }; + auto stop_fn = [&](raft_handle_wrapper_t& handle) -> std::any { + if (index) { // Check if unique_ptr holds an object + index.reset(); + } + dataset_device_ptr_.reset(); + return std::any(); + }; + worker->start(init_fn, stop_fn); + + init_complete_future.get(); // Wait for the init_fn to complete + is_loaded_ = true; + } + + /** + * @brief Search result containing neighbor IDs and distances. 
+ */ + struct search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors + }; + + /** + * @brief Performs brute-force search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @return Search results. + */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit) { + if (!queries_data || num_queries == 0 || dimension == 0) { // Check for invalid input + return search_result_t{}; + } + if (query_dimension != this->dimension) { + throw std::runtime_error("Query dimension does not match index dimension."); + } + if (limit == 0) { + return search_result_t{}; + } + if (!index) { + return search_result_t{}; + } + + size_t queries_rows = num_queries; + size_t queries_cols = dimension; // Use the class's dimension + + uint64_t job_id = worker->submit( + [&, queries_rows, queries_cols, limit](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); // Acquire shared read-only lock inside worker thread + + auto queries_device = raft::make_device_matrix( + *handle.get_raft_resources(), static_cast(queries_rows), static_cast(queries_cols)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + queries_rows * queries_cols * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*handle.get_raft_resources()))); + + auto neighbors_device = raft::make_device_matrix( + *handle.get_raft_resources(), static_cast(queries_rows), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *handle.get_raft_resources(), static_cast(queries_rows), static_cast(limit)); + + cuvs::neighbors::brute_force::search_params search_params; + cuvs::neighbors::brute_force::search(*handle.get_raft_resources(), 
search_params, *index, + raft::make_const_mdspan(queries_device.view()), neighbors_device.view(), distances_device.view()); + + search_result_t res; + res.neighbors.resize(queries_rows * limit); + res.distances.resize(queries_rows * limit); + + RAFT_CUDA_TRY(cudaMemcpyAsync(res.neighbors.data(), neighbors_device.data_handle(), + res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*handle.get_raft_resources()))); + RAFT_CUDA_TRY(cudaMemcpyAsync(res.distances.data(), distances_device.data_handle(), + res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*handle.get_raft_resources()))); + + raft::resource::sync_stream(*handle.get_raft_resources()); + + // Post-process to handle sentinels + for (size_t i = 0; i < res.neighbors.size(); ++i) { + if (res.neighbors[i] == std::numeric_limits::max() || + res.neighbors[i] == 4294967295LL || + res.neighbors[i] < 0) { + res.neighbors[i] = -1; + } + } + + return res; + } + ); + + cuvs_task_result_t result = worker->wait(job_id).get(); + if (result.error) { + std::rethrow_exception(result.error); + } + + return std::any_cast(result.result); + } + + void destroy() { + if (worker) { + worker->stop(); + } + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/brute_force_c.cpp b/cgo/cuvs/brute_force_c.cpp new file mode 100644 index 0000000000000..340a255eeeb5d --- /dev/null +++ b/cgo/cuvs/brute_force_c.cpp @@ -0,0 +1,145 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "brute_force_c.h" +#include "brute_force.hpp" +#include +#include +#include +#include +#include +#include +#include + +struct gpu_brute_force_any_t { + + quantization_t qtype; + void* ptr; + + gpu_brute_force_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_brute_force_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_brute_force_c gpu_brute_force_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric_c, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* index_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + index_ptr = new matrixone::gpu_brute_force_t(static_cast(dataset_data), count_vectors, dimension, metric, nthread, device_id); + break; + case Quantization_F16: + index_ptr = new matrixone::gpu_brute_force_t(static_cast(dataset_data), count_vectors, dimension, metric, nthread, device_id); + break; + default: + throw std::runtime_error("Unsupported quantization type for brute force (only f32 and f16 supported)"); + } + return static_cast(new gpu_brute_force_any_t(qtype, index_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_new", e.what()); + return nullptr; + } +} + +void gpu_brute_force_load(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->load(); break; + case Quantization_F16: static_cast*>(any->ptr)->load(); break; + default: break; + } + } catch (const 
std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_load", e.what()); + } +} + +gpu_brute_force_search_result_c gpu_brute_force_search(gpu_brute_force_c index_c, const void* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + void* result_ptr = nullptr; + switch (any->qtype) { + case Quantization_F32: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + case Quantization_F16: { + auto res = std::make_unique::search_result_t>(); + *res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit); + result_ptr = res.release(); + break; + } + default: break; + } + return static_cast(result_ptr); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_search", e.what()); + return nullptr; + } +} + +void gpu_brute_force_get_results(gpu_brute_force_search_result_c result_c, uint64_t num_queries, uint32_t limit, int64_t* neighbors, float* distances) { + if (!result_c) return; + auto* search_result = static_cast::search_result_t*>(result_c); + + size_t total = num_queries * limit; + if (search_result->neighbors.size() >= total) { + std::copy(search_result->neighbors.begin(), search_result->neighbors.begin() + total, neighbors); + } else { + std::fill(neighbors, neighbors + total, -1); + } + + if (search_result->distances.size() >= total) { + std::copy(search_result->distances.begin(), search_result->distances.begin() + total, distances); + } else { + std::fill(distances, distances + total, std::numeric_limits::infinity()); + } +} + +void gpu_brute_force_free_search_result(gpu_brute_force_search_result_c result_c) { + if (!result_c) return; + delete static_cast::search_result_t*>(result_c); +} + +void 
gpu_brute_force_destroy(gpu_brute_force_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_brute_force_destroy", e.what()); + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_brute_force_t; +template class gpu_brute_force_t; +} diff --git a/cgo/cuvs/brute_force_c.h b/cgo/cuvs/brute_force_c.h new file mode 100644 index 0000000000000..6042ec9608ae6 --- /dev/null +++ b/cgo/cuvs/brute_force_c.h @@ -0,0 +1,54 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef BRUTE_FORCE_C_H +#define BRUTE_FORCE_C_H + +#include "helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_brute_force_t object +typedef void* gpu_brute_force_c; + +// Opaque pointer to the C++ search result object +typedef void* gpu_brute_force_search_result_c; + +// Constructor for gpu_brute_force_t +gpu_brute_force_c gpu_brute_force_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, distance_type_t metric, uint32_t nthread, int device_id, quantization_t qtype, void* errmsg); + +// Loads the index to the GPU +void gpu_brute_force_load(gpu_brute_force_c index_c, void* errmsg); + +// Performs a search operation +gpu_brute_force_search_result_c gpu_brute_force_search(gpu_brute_force_c index_c, const void* queries_data, uint64_t num_queries, uint32_t query_dimension, uint32_t limit, void* errmsg); + +// Retrieves the results from a search operation +void gpu_brute_force_get_results(gpu_brute_force_search_result_c result_c, uint64_t num_queries, uint32_t limit, int64_t* neighbors, float* distances); + +// Frees the memory for a gpu_brute_force_search_result_c object +void gpu_brute_force_free_search_result(gpu_brute_force_search_result_c result_c); + +// Destroys the gpu_brute_force_t object and frees associated resources +void gpu_brute_force_destroy(gpu_brute_force_c index_c, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // BRUTE_FORCE_C_H diff --git a/cgo/cuvs/cagra.hpp b/cgo/cuvs/cagra.hpp new file mode 100644 index 0000000000000..62d1046f0ced4 --- /dev/null +++ b/cgo/cuvs/cagra.hpp @@ -0,0 +1,434 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include "cuvs_types.h" // For distance_type_t, cagra_build_params_t, etc. +#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include // For std::copy +#include // For simulation debug logs +#include +#include // For std::iota +#include // For std::runtime_error +#include +#include +#include +#include // For std::promise and std::future +#include // For std::numeric_limits +#include // For std::shared_mutex + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include +#include +#include +#include +#include // For raft::copy with type conversion +#include // For checking SNMG type + +// cuVS includes +#include +#include +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief gpu_cagra_t implements a CAGRA index that can run on a single GPU or sharded across multiple GPUs. + * It automatically chooses between single-GPU and multi-GPU (SNMG) cuVS APIs based on the RAFT handle resources. 
+ */ +template +class gpu_cagra_t { +public: + using cagra_index = cuvs::neighbors::cagra::index; + using mg_index = cuvs::neighbors::mg_index; + + std::vector flattened_host_dataset; + std::vector devices_; + std::string filename_; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + cuvs::distance::DistanceType metric; + uint32_t dimension; + uint32_t count; + cagra_build_params_t build_params; + distribution_mode_t dist_mode; + + std::unique_ptr worker; + std::shared_mutex mutex_; + bool is_loaded_ = false; + std::shared_ptr dataset_device_ptr_; // Keeps device dataset alive for single-GPU build + + ~gpu_cagra_t() { + destroy(); + } + + // Unified Constructor for building from dataset + gpu_cagra_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const cagra_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) + : dimension(dimension), count(static_cast(count_vectors)), metric(m), + build_params(bp), dist_mode(mode), devices_(devices) { + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + worker = std::make_unique(nthread, devices_, force_mg || (devices_.size() > 1)); + + flattened_host_dataset.resize(count * dimension); + if (dataset_data) { + std::copy(dataset_data, dataset_data + (count * dimension), flattened_host_dataset.begin()); + } + } + + // Unified Constructor for loading from file + gpu_cagra_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const cagra_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) + : filename_(filename), dimension(dimension), metric(m), count(0), + build_params(bp), dist_mode(mode), devices_(devices) { + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + worker = std::make_unique(nthread, devices_, force_mg || (devices_.size() > 1)); + } + 
+ // Private constructor for creating from an existing cuVS index (used by merge) + gpu_cagra_t(std::unique_ptr idx, + uint32_t dim, cuvs::distance::DistanceType m, uint32_t nthread, const std::vector& devices) + : index_(std::move(idx)), metric(m), dimension(dim), devices_(devices) { + + // Merge result is currently a single-GPU index. + worker = std::make_unique(nthread, devices_, false); + worker->start(); + count = static_cast(index_->size()); + build_params.graph_degree = static_cast(index_->graph_degree()); + build_params.intermediate_graph_degree = build_params.graph_degree * 2; // Best guess + dist_mode = DistributionMode_SINGLE_GPU; + is_loaded_ = true; + } + + /** + * @brief Loads the index from file or builds it from the dataset. + */ + void load() { + std::unique_lock lock(mutex_); + if (is_loaded_) return; + + std::promise init_complete_promise; + std::future init_complete_future = init_complete_promise.get_future(); + + auto init_fn = [&](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::cagra::deserialize(*res, filename_)); + count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + build_params.graph_degree = static_cast(mg_index_->ann_interfaces_[0].index_.value().graph_degree()); + } + } else { + index_ = std::make_unique(*res); + cuvs::neighbors::cagra::deserialize(*res, filename_, index_.get()); + count = static_cast(index_->size()); + build_params.graph_degree = static_cast(index_->graph_degree()); + } + raft::resource::sync_stream(*res); + } else if (!flattened_host_dataset.empty()) { + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + 
flattened_host_dataset.data(), (int64_t)count, (int64_t)dimension); + + cuvs::neighbors::cagra::index_params index_params; + index_params.metric = metric; + index_params.intermediate_graph_degree = build_params.intermediate_graph_degree; + index_params.graph_degree = build_params.graph_degree; + + cuvs::neighbors::mg_index_params mg_params(index_params); + if (dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::cagra::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(count), static_cast(dimension))); + + dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), flattened_host_dataset.data(), + flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::cagra::index_params index_params; + index_params.metric = metric; + index_params.intermediate_graph_degree = build_params.intermediate_graph_degree; + index_params.graph_degree = build_params.graph_degree; + index_params.attach_dataset_on_build = build_params.attach_dataset_on_build; + + index_ = std::make_unique( + cuvs::neighbors::cagra::build(*res, index_params, raft::make_const_mdspan(dataset_device->view()))); + } + raft::resource::sync_stream(*res); + } + + init_complete_promise.set_value(true); + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t& handle) -> std::any { + index_.reset(); + mg_index_.reset(); + dataset_device_ptr_.reset(); + return std::any(); + }; + + worker->start(init_fn, stop_fn); + init_complete_future.get(); + is_loaded_ = true; + } + + /** + * @brief Extends the existing index with additional vectors. 
+ * @param additional_data Pointer to additional vectors on host. + * @param num_vectors Number of vectors to add. + */ + void extend(const T* additional_data, uint64_t num_vectors) { + if constexpr (std::is_same_v) { + throw std::runtime_error("CAGRA single-GPU extend is not supported for float16 (half) by cuVS."); + } else { + if (!is_loaded_ || !index_) { + throw std::runtime_error("index must be loaded before extending (or it is a multi-GPU index, which doesn't support extend)."); + } + if (num_vectors == 0) return; + + std::unique_lock lock(mutex_); + + uint64_t job_id = worker->submit( + [&, additional_data, num_vectors](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + auto additional_dataset_device = raft::make_device_matrix( + *res, static_cast(num_vectors), static_cast(dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(additional_dataset_device.data_handle(), additional_data, + num_vectors * dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::cagra::extend_params params; + cuvs::neighbors::cagra::extend(*res, params, raft::make_const_mdspan(additional_dataset_device.view()), *index_); + + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + + count += static_cast(num_vectors); + if (!flattened_host_dataset.empty()) { + size_t old_size = flattened_host_dataset.size(); + flattened_host_dataset.resize(old_size + num_vectors * dimension); + std::copy(additional_data, additional_data + num_vectors * dimension, flattened_host_dataset.begin() + old_size); + } + } + } + + /** + * @brief Merges multiple single-GPU CAGRA indices into one. + * @param indices List of pointers to CAGRA indices. + * @param nthread Number of worker threads for the merged index. + * @param devices GPU devices to use for the merged index. 
+ * @return A new merged CAGRA index. + */ + static std::unique_ptr> merge(const std::vector*>& indices, uint32_t nthread, const std::vector& devices) { + if (indices.empty()) return nullptr; + + uint32_t dim = indices[0]->dimension; + cuvs::distance::DistanceType m = indices[0]->metric; + + cuvs_worker_t transient_worker(1, devices, false); + transient_worker.start(); + + uint64_t job_id = transient_worker.submit( + [&indices](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + + std::vector cagra_indices; + for (auto* idx : indices) { + if (!idx->is_loaded_ || !idx->index_) { + throw std::runtime_error("One of the indices to merge is not loaded or is a multi-GPU index (merge only supports single-GPU indices)."); + } + cagra_indices.push_back(idx->index_.get()); + } + + cuvs::neighbors::cagra::index_params index_params; + + auto merged_index = std::make_unique( + cuvs::neighbors::cagra::merge(*res, index_params, cagra_indices) + ); + + raft::resource::sync_stream(*res); + return merged_index.release(); + } + ); + + cuvs_task_result_t result = transient_worker.wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + + auto* merged_index_raw = std::any_cast(result.result); + auto merged_index_ptr = std::unique_ptr(merged_index_raw); + transient_worker.stop(); + + return std::make_unique>(std::move(merged_index_ptr), dim, m, nthread, devices); + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. 
+ */ + void save(const std::string& filename) { + if (!is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = worker->submit( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::cagra::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::cagra::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Search result containing neighbor IDs and distances. + */ + struct search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors + }; + + /** + * @brief Performs CAGRA search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp CAGRA search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const cagra_search_params_t& sp) { + if (!queries_data || num_queries == 0 || dimension == 0) return search_result_t{}; + if (query_dimension != dimension) throw std::runtime_error("dimension mismatch"); + if (!is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + uint64_t job_id = worker->submit( + [&, num_queries, limit, sp](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::cagra::search_params search_params; + search_params.itopk_size = sp.itopk_size; + search_params.search_width = sp.search_width; + + if (is_snmg_handle(res)) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, (int64_t)num_queries, (int64_t)dimension); + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::cagra::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + 
cuvs::neighbors::cagra::search(*res, search_params, *index_, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max()) { + search_res.neighbors[i] = static_cast(-1); + } + } + return search_res; + } + ); + + cuvs_task_result_t result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + void destroy() { + if (worker) worker->stop(); + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/cagra_c.cpp b/cgo/cuvs/cagra_c.cpp new file mode 100644 index 0000000000000..97faac931d9f2 --- /dev/null +++ b/cgo/cuvs/cagra_c.cpp @@ -0,0 +1,271 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cagra_c.h" +#include "cagra.hpp" +#include +#include +#include +#include +#include +#include + +struct gpu_cagra_any_t { + quantization_t qtype; + void* ptr; + + gpu_cagra_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_cagra_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_cagra_c gpu_cagra_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric_c, cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const std::exception& e) { 
+ set_errmsg(errmsg, "Error in gpu_cagra_new", e.what()); + return nullptr; + } +} + +gpu_cagra_c gpu_cagra_load_file(const char* filename, uint32_t dimension, distance_type_t metric_c, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* cagra_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + cagra_ptr = new matrixone::gpu_cagra_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for CAGRA"); + } + return static_cast(new gpu_cagra_any_t(qtype, cagra_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_load_file", e.what()); + return nullptr; + } +} + +void gpu_cagra_destroy(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_destroy", e.what()); + } +} + +void gpu_cagra_load(gpu_cagra_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: 
static_cast*>(any->ptr)->load(); break; + case Quantization_F16: static_cast*>(any->ptr)->load(); break; + case Quantization_INT8: static_cast*>(any->ptr)->load(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->load(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_load", e.what()); + } +} + +void gpu_cagra_save(gpu_cagra_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_save", e.what()); + } +} + +gpu_cagra_search_res_t gpu_cagra_search(gpu_cagra_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_cagra_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_cagra_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_cagra_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_cagra_t::search_result_t(); + *cpp_res = 
static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_UINT8: { + auto* cpp_res = new matrixone::gpu_cagra_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_search", e.what()); + } + return res; +} + +void gpu_cagra_get_neighbors(gpu_cagra_result_c result_c, uint64_t total_elements, uint32_t* neighbors) { + if (!result_c) return; + // Using float's search_result_t is safe as neighbors is always uint32_t + auto* neighbors_vec = &static_cast::search_result_t*>(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_cagra_get_distances(gpu_cagra_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + // Using float's search_result_t is safe as distances is always float + auto* distances_vec = &static_cast::search_result_t*>(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_cagra_free_result(gpu_cagra_result_c result_c) { + if (!result_c) return; + delete static_cast::search_result_t*>(result_c); +} + +void gpu_cagra_extend(gpu_cagra_c index_c, const void* additional_data, uint64_t num_vectors, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_F16: 
static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_INT8: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + case Quantization_UINT8: static_cast*>(any->ptr)->extend(static_cast(additional_data), num_vectors); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_extend", e.what()); + } +} + +gpu_cagra_c gpu_cagra_merge(gpu_cagra_c* indices_c, int num_indices, uint32_t nthread, const int* devices, int device_count, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (num_indices == 0) return nullptr; + std::vector devs(devices, devices + device_count); + auto* first_any = static_cast(indices_c[0]); + quantization_t qtype = first_any->qtype; + + void* merged_ptr = nullptr; + if (qtype == Quantization_F32) { + std::vector*> cpp_indices; + for (int i = 0; i < num_indices; ++i) cpp_indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(cpp_indices, nthread, devs).release(); + } else if (qtype == Quantization_F16) { + std::vector*> cpp_indices; + for (int i = 0; i < num_indices; ++i) cpp_indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(cpp_indices, nthread, devs).release(); + } else if (qtype == Quantization_INT8) { + std::vector*> cpp_indices; + for (int i = 0; i < num_indices; ++i) cpp_indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(cpp_indices, nthread, devs).release(); + } else if (qtype == Quantization_UINT8) { + std::vector*> cpp_indices; + for (int i = 0; i < num_indices; ++i) cpp_indices.push_back(static_cast*>(static_cast(indices_c[i])->ptr)); + merged_ptr = matrixone::gpu_cagra_t::merge(cpp_indices, nthread, devs).release(); + } else { + throw std::runtime_error("Unsupported quantization type for merge"); + } + return 
static_cast(new gpu_cagra_any_t(qtype, merged_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_cagra_merge", e.what()); + return nullptr; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_cagra_t; +template class gpu_cagra_t; +template class gpu_cagra_t; +template class gpu_cagra_t; +} diff --git a/cgo/cuvs/cagra_c.h b/cgo/cuvs/cagra_c.h new file mode 100644 index 0000000000000..3670765b0d5ec --- /dev/null +++ b/cgo/cuvs/cagra_c.h @@ -0,0 +1,80 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef CAGRA_C_H +#define CAGRA_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_cagra_t object +typedef void* gpu_cagra_c; + +// Opaque pointer to the C++ CAGRA search result object +typedef void* gpu_cagra_result_c; + +// Constructor for building from dataset +gpu_cagra_c gpu_cagra_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_cagra_c gpu_cagra_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + cagra_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Destructor +void gpu_cagra_destroy(gpu_cagra_c index_c, void* errmsg); + +// Load function (actually triggers the build/load logic) +void gpu_cagra_load(gpu_cagra_c index_c, void* errmsg); + +// Save function +void gpu_cagra_save(gpu_cagra_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_cagra_result_c result_ptr; +} gpu_cagra_search_res_t; + +gpu_cagra_search_res_t gpu_cagra_search(gpu_cagra_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + cagra_search_params_t search_params, void* errmsg); + +// Get results from result object +void gpu_cagra_get_neighbors(gpu_cagra_result_c result_c, uint64_t total_elements, uint32_t* neighbors); +void gpu_cagra_get_distances(gpu_cagra_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_cagra_free_result(gpu_cagra_result_c result_c); + +// Extend function +void gpu_cagra_extend(gpu_cagra_c index_c, const void* additional_data, uint64_t num_vectors, void* errmsg); + +// 
Merge function +gpu_cagra_c gpu_cagra_merge(gpu_cagra_c* indices_c, int num_indices, uint32_t nthread, const int* devices, int device_count, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // CAGRA_C_H diff --git a/cgo/cuvs/cuvs_types.h b/cgo/cuvs/cuvs_types.h new file mode 100644 index 0000000000000..95ce18024fff7 --- /dev/null +++ b/cgo/cuvs/cuvs_types.h @@ -0,0 +1,135 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MO_CUVS_TYPES_H +#define MO_CUVS_TYPES_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Distance metrics supported by cuVS. 
+ */ +typedef enum { + DistanceType_L2Expanded = 0, // Squared L2 distance: sum((x-y)^2) + DistanceType_L2SqrtExpanded = 1, // L2 distance: sqrt(sum((x-y)^2)) + DistanceType_CosineExpanded = 2, // Cosine distance: 1 - (x.y)/(|x||y|) + DistanceType_L1 = 3, // L1 (Manhattan) distance: sum(|x-y|) + DistanceType_L2Unexpanded = 4, // L2 distance without expansion + DistanceType_L2SqrtUnexpanded = 5, // L2 distance with sqrt without expansion + DistanceType_InnerProduct = 6, // Inner product: x.y + DistanceType_Linf = 7, // Chebyshev distance: max(|x-y|) + DistanceType_Canberra = 8, // Canberra distance + DistanceType_LpUnexpanded = 9, // Lp distance + DistanceType_CorrelationExpanded = 10, // Correlation distance + DistanceType_JaccardExpanded = 11, // Jaccard distance + DistanceType_HellingerExpanded = 12, // Hellinger distance + DistanceType_Haversine = 13, // Haversine distance + DistanceType_BrayCurtis = 14, // Bray-Curtis distance + DistanceType_JensenShannon = 15, // Jensen-Shannon distance + DistanceType_HammingUnexpanded = 16, // Hamming distance + DistanceType_KLDivergence = 17, // Kullback-Leibler divergence + DistanceType_RusselRaoExpanded = 18, // Russel-Rao distance + DistanceType_DiceExpanded = 19, // Dice distance + DistanceType_BitwiseHamming = 20, // Bitwise Hamming distance + DistanceType_Precomputed = 100, // Precomputed distance + // Aliases + DistanceType_CosineSimilarity = 2, // Alias for Cosine distance + DistanceType_Jaccard = 11, // Alias for Jaccard distance + DistanceType_Hamming = 16, // Alias for Hamming distance + DistanceType_Unknown = 255 // Unknown distance type +} distance_type_t; + +/** + * @brief Data quantization types. + */ +typedef enum { + Quantization_F32, // 32-bit floating point + Quantization_F16, // 16-bit floating point (half) + Quantization_INT8, // 8-bit signed integer + Quantization_UINT8 // 8-bit unsigned integer +} quantization_t; + +/** + * @brief GPU distribution modes. 
+ */ +typedef enum { + DistributionMode_SINGLE_GPU, // Single GPU mode + DistributionMode_SHARDED, // Sharded across multiple GPUs + DistributionMode_REPLICATED // Replicated across multiple GPUs +} distribution_mode_t; + +/** + * @brief CAGRA index build parameters. + */ +typedef struct { + size_t intermediate_graph_degree; // Degree of the intermediate graph (default 128) + size_t graph_degree; // Degree of the final graph (default 64) + bool attach_dataset_on_build; // Whether to attach the dataset to the index (default true) +} cagra_build_params_t; + +/** + * @brief CAGRA search parameters. + */ +typedef struct { + size_t itopk_size; // Internal top-k size (default 64) + size_t search_width; // Number of search paths (default 1) +} cagra_search_params_t; + +/** + * @brief IVF-Flat index build parameters. + */ +typedef struct { + uint32_t n_lists; // Number of inverted lists (clusters) (default 1024) + bool add_data_on_build; // Whether to add data to the index during build (default true) + double kmeans_trainset_fraction; // Fraction of data to use for k-means training (default 0.5) +} ivf_flat_build_params_t; + +/** + * @brief IVF-Flat search parameters. 
+ */ +typedef struct { + uint32_t n_probes; // Number of lists to probe during search (default 20) +} ivf_flat_search_params_t; + +#ifdef __cplusplus +static inline cagra_build_params_t cagra_build_params_default() { + return {128, 64, true}; +} + +static inline cagra_search_params_t cagra_search_params_default() { + return {64, 1}; +} + +static inline ivf_flat_build_params_t ivf_flat_build_params_default() { + return {1024, true, 0.5}; +} + +static inline ivf_flat_search_params_t ivf_flat_search_params_default() { + return {20}; +} +#endif + +#ifdef __cplusplus +} +#endif + +#endif // MO_CUVS_TYPES_H diff --git a/cgo/cuvs/cuvs_worker.hpp b/cgo/cuvs/cuvs_worker.hpp new file mode 100644 index 0000000000000..27a149c5bf60e --- /dev/null +++ b/cgo/cuvs/cuvs_worker.hpp @@ -0,0 +1,367 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#include +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include +#include +#include +#include +#include +#include +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief Wrapper for RAFT resources to manage their lifecycle. 
+ * Supports both single-GPU and single-node multi-GPU (SNMG) modes. + */ +class raft_handle_wrapper_t { +public: + // Default constructor for single-GPU mode (uses current device) + raft_handle_wrapper_t() : resources_(std::make_unique()) {} + + // Constructor for single-GPU mode with a specific device ID + explicit raft_handle_wrapper_t(int device_id) { + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + resources_ = std::make_unique(); + } + + // Constructor for multi-GPU mode (SNMG) + // force_mg: If true, use device_resources_snmg even if devices.size() == 1 (useful for testing) + explicit raft_handle_wrapper_t(const std::vector& devices, bool force_mg = false) { + if (devices.empty()) { + resources_ = std::make_unique(); + } else if (devices.size() == 1 && !force_mg) { + RAFT_CUDA_TRY(cudaSetDevice(devices[0])); + resources_ = std::make_unique(); + } else { + // Ensure the main device is set before creating SNMG resources + RAFT_CUDA_TRY(cudaSetDevice(devices[0])); + resources_ = std::make_unique(devices); + } + } + + ~raft_handle_wrapper_t() = default; + + raft::resources* get_raft_resources() const { return resources_.get(); } + +private: + std::unique_ptr resources_; +}; + +/** + * @brief Helper to check if a RAFT handle is configured for Multi-GPU (SNMG). + */ +static inline bool is_snmg_handle(raft::resources* res) { + return dynamic_cast(res) != nullptr; +} + +/** + * @brief A thread-safe blocking queue for task distribution. 
+ */ +template +class thread_safe_queue_t { +public: + void push(T value) { + { + std::lock_guard lock(mu_); + queue_.push_back(std::move(value)); + } + cv_.notify_one(); + } + + bool pop(T& value) { + std::unique_lock lock(mu_); + cv_.wait(lock, [this] { return !queue_.empty() || stopped_; }); + if (queue_.empty()) return false; + value = std::move(queue_.front()); + queue_.pop_front(); + return true; + } + + void stop() { + { + std::lock_guard lock(mu_); + stopped_ = true; + } + cv_.notify_all(); + } + + bool is_stopped() const { + std::lock_guard lock(mu_); + return stopped_; + } + +private: + std::deque queue_; + mutable std::mutex mu_; + std::condition_variable cv_; + bool stopped_ = false; +}; + +struct cuvs_task_result_t { + uint64_t id; + std::any result; + std::exception_ptr error; +}; + +/** + * @brief Manages storage and retrieval of task results. + */ +class cuvs_task_result_store_t { +public: + cuvs_task_result_store_t() : next_id_(1), stopped_(false) {} + + uint64_t get_next_job_id() { return next_id_.fetch_add(1); } + + void store(const cuvs_task_result_t& result) { + std::unique_lock lock(mu_); + if (auto it = pending_.find(result.id); it != pending_.end()) { + auto promise = std::move(it->second); + pending_.erase(it); + lock.unlock(); + promise->set_value(result); + } else { + results_[result.id] = result; + } + } + + std::future wait(uint64_t job_id) { + std::unique_lock lock(mu_); + if (stopped_) { + std::promise p; + p.set_exception(std::make_exception_ptr(std::runtime_error("cuvs_task_result_store_t stopped before result was available"))); + return p.get_future(); + } + + if (auto it = results_.find(job_id); it != results_.end()) { + std::promise p; + p.set_value(std::move(it->second)); + results_.erase(it); + return p.get_future(); + } + + auto promise = std::make_shared>(); + pending_[job_id] = promise; + return promise->get_future(); + } + + void stop() { + std::lock_guard lock(mu_); + stopped_ = true; + for (auto& pair : pending_) { + 
pair.second->set_exception(std::make_exception_ptr(std::runtime_error("cuvs_task_result_store_t stopped before result was available"))); + } + pending_.clear(); + results_.clear(); + } + +private: + std::atomic next_id_; + std::mutex mu_; + std::map>> pending_; + std::map results_; + bool stopped_; +}; + +/** + * @brief dedicated worker pool for executing cuVS (RAFT) tasks in GPU-enabled threads. + */ +class cuvs_worker_t { +public: + using raft_handle = raft_handle_wrapper_t; + using user_task_fn = std::function; + + struct cuvs_task_t { + uint64_t id; + user_task_fn fn; + }; + + explicit cuvs_worker_t(size_t n_threads, int device_id = -1) + : n_threads_(n_threads), device_id_(device_id) { + if (n_threads == 0) throw std::invalid_argument("Thread count must be > 0"); + } + + cuvs_worker_t(size_t n_threads, const std::vector& devices, bool force_mg = false) + : n_threads_(n_threads), devices_(devices), force_mg_(force_mg) { + if (n_threads == 0) throw std::invalid_argument("Thread count must be > 0"); + } + + ~cuvs_worker_t() { stop(); } + + cuvs_worker_t(const cuvs_worker_t&) = delete; + cuvs_worker_t& operator=(const cuvs_worker_t&) = delete; + + void start(user_task_fn init_fn = nullptr, user_task_fn stop_fn = nullptr) { + if (started_.exchange(true)) return; + main_thread_ = std::thread(&cuvs_worker_t::run_main_loop, this, std::move(init_fn), std::move(stop_fn)); + } + + void stop() { + if (!started_.load() || stopped_.exchange(true)) return; + + tasks_.stop(); + { + std::lock_guard lock(event_mu_); + should_stop_ = true; + } + event_cv_.notify_all(); + + if (main_thread_.joinable()) main_thread_.join(); + for (auto& t : sub_workers_) if (t.joinable()) t.join(); + + sub_workers_.clear(); + result_store_.stop(); + } + + uint64_t submit(user_task_fn fn) { + if (stopped_.load()) throw std::runtime_error("Cannot submit task: worker stopped"); + uint64_t id = result_store_.get_next_job_id(); + tasks_.push({id, std::move(fn)}); + return id; + } + + std::future 
wait(uint64_t id) { return result_store_.wait(id); } + + std::exception_ptr get_first_error() { + std::lock_guard lock(event_mu_); + return fatal_error_; + } + +private: + void run_main_loop(user_task_fn init_fn, user_task_fn stop_fn) { + pin_thread(0); + auto resource = setup_resource(); + if (!resource) return; + + if (init_fn) { + try { init_fn(*resource); } + catch (...) { report_fatal_error(std::current_exception()); return; } + } + + // Defer stop_fn cleanup + auto defer_cleanup = [&]() { if (stop_fn) try { stop_fn(*resource); } catch (...) {} }; + std::shared_ptr cleanup_guard(nullptr, [&](...) { defer_cleanup(); }); + + if (n_threads_ == 1) { + cuvs_task_t task; + while (tasks_.pop(task)) execute_task(task, *resource); + } else { + for (size_t i = 0; i < n_threads_; ++i) { + sub_workers_.emplace_back(&cuvs_worker_t::worker_sub_loop, this); + } + std::unique_lock lock(event_mu_); + event_cv_.wait(lock, [this] { return should_stop_ || fatal_error_; }); + } + } + + void worker_sub_loop() { + pin_thread(-1); + auto resource = setup_resource(); + if (!resource) return; + + cuvs_task_t task; + while (tasks_.pop(task)) execute_task(task, *resource); + } + + void execute_task(const cuvs_task_t& task, raft_handle& resource) { + cuvs_task_result_t res{task.id}; + try { res.result = task.fn(resource); } + catch (...) { + res.error = std::current_exception(); + std::cerr << "ERROR: Task " << task.id << " failed." << std::endl; + } + result_store_.store(res); + } + + std::unique_ptr setup_resource() { + try { + if (!devices_.empty()) { + return std::make_unique(devices_, force_mg_); + } else if (device_id_ >= 0) { + return std::make_unique(device_id_); + } else { + return std::make_unique(); + } + } catch (...) { + report_fatal_error(std::current_exception()); + std::cerr << "ERROR: Failed to setup RAFT resource." 
<< std::endl; + return nullptr; + } + } + + void report_fatal_error(std::exception_ptr err) { + std::lock_guard lock(event_mu_); + if (!fatal_error_) fatal_error_ = err; + should_stop_ = true; + event_cv_.notify_all(); + } + + void pin_thread(int cpu_id) { +#ifdef __linux__ + static std::atomic next_cpu_id{1}; + int id = (cpu_id >= 0) ? cpu_id : (next_cpu_id.fetch_add(1) % std::thread::hardware_concurrency()); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(id, &cpuset); + if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0) { + std::cerr << "WARNING: Failed to set affinity for thread to core " << id << std::endl; + } +#endif + } + + size_t n_threads_; + int device_id_ = -1; + std::vector devices_; + bool force_mg_ = false; + std::atomic started_{false}; + std::atomic stopped_{false}; + thread_safe_queue_t tasks_; + cuvs_task_result_store_t result_store_; + std::thread main_thread_; + std::vector sub_workers_; + + std::mutex event_mu_; + std::condition_variable event_cv_; + bool should_stop_ = false; + std::exception_ptr fatal_error_; +}; + +} // namespace matrixone diff --git a/cgo/cuvs/helper.cpp b/cgo/cuvs/helper.cpp new file mode 100644 index 0000000000000..32f1ea5c7730a --- /dev/null +++ b/cgo/cuvs/helper.cpp @@ -0,0 +1,153 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "helper.h" +#include "cuvs_worker.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace matrixone { +cuvs::distance::DistanceType convert_distance_type(distance_type_t metric_c) { + switch (metric_c) { + case DistanceType_L2Expanded: return cuvs::distance::DistanceType::L2Expanded; + case DistanceType_L2SqrtExpanded: return cuvs::distance::DistanceType::L2SqrtExpanded; + case DistanceType_CosineExpanded: return cuvs::distance::DistanceType::CosineExpanded; + case DistanceType_L1: return cuvs::distance::DistanceType::L1; + case DistanceType_L2Unexpanded: return cuvs::distance::DistanceType::L2Unexpanded; + case DistanceType_L2SqrtUnexpanded: return cuvs::distance::DistanceType::L2SqrtUnexpanded; + case DistanceType_InnerProduct: return cuvs::distance::DistanceType::InnerProduct; + case DistanceType_Linf: return cuvs::distance::DistanceType::Linf; + case DistanceType_Canberra: return cuvs::distance::DistanceType::Canberra; + case DistanceType_LpUnexpanded: return cuvs::distance::DistanceType::LpUnexpanded; + case DistanceType_CorrelationExpanded: return cuvs::distance::DistanceType::CorrelationExpanded; + case DistanceType_JaccardExpanded: return cuvs::distance::DistanceType::JaccardExpanded; + case DistanceType_HellingerExpanded: return cuvs::distance::DistanceType::HellingerExpanded; + case DistanceType_Haversine: return cuvs::distance::DistanceType::Haversine; + case DistanceType_BrayCurtis: return cuvs::distance::DistanceType::BrayCurtis; + case DistanceType_JensenShannon: return cuvs::distance::DistanceType::JensenShannon; + case DistanceType_HammingUnexpanded: return cuvs::distance::DistanceType::HammingUnexpanded; + case DistanceType_KLDivergence: return cuvs::distance::DistanceType::KLDivergence; + case DistanceType_RusselRaoExpanded: return cuvs::distance::DistanceType::RusselRaoExpanded; + case DistanceType_DiceExpanded: return cuvs::distance::DistanceType::DiceExpanded; + case DistanceType_BitwiseHamming: 
return cuvs::distance::DistanceType::BitwiseHamming; + case DistanceType_Precomputed: return cuvs::distance::DistanceType::Precomputed; + default: + throw std::runtime_error("Unknown or unsupported distance type"); + } +} +} + +// Vectorized kernel processing 2 elements per thread +__global__ void f32_to_f16_vectorized_kernel(const float2* src, half2* dst, uint64_t n_pairs) { + uint64_t i = blockIdx.x * (uint64_t)blockDim.x + threadIdx.x; + if (i < n_pairs) { + dst[i] = __float22half2_rn(src[i]); + } +} + +// Fallback kernel for the last element if total_elements is odd +__global__ void f32_to_f16_tail_kernel(const float* src, half* dst, uint64_t index) { + dst[index] = __float2half(src[index]); +} + +extern "C" { + +int gpu_get_device_count() { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err != cudaSuccess) { + return -1; + } + return count; +} + +int gpu_get_device_list(int* devices, int max_count) { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); + if (err != cudaSuccess) { + return -1; + } + int actual_count = (count > max_count) ? 
max_count : count; + for (int i = 0; i < actual_count; ++i) { + devices[i] = i; + } + return actual_count; +} + +void set_errmsg(void* errmsg, const char* prefix, const char* what) { + if (errmsg) { + std::string err_str = std::string(prefix) + ": " + std::string(what); + char* msg = (char*)malloc(err_str.length() + 1); + if (msg) { + std::strcpy(msg, err_str.c_str()); + *(static_cast(errmsg)) = msg; + } + } else { + std::cerr << prefix << ": " << what << std::endl; + } +} + +void gpu_convert_f32_to_f16(const float* src, void* dst, uint64_t total_elements, int device_id, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + if (!src || !dst || total_elements == 0) return; + + RAFT_CUDA_TRY(cudaSetDevice(device_id)); + + float *d_src = nullptr; + half *d_dst = nullptr; + + // Allocate device memory + RAFT_CUDA_TRY(cudaMalloc(&d_src, total_elements * sizeof(float))); + RAFT_CUDA_TRY(cudaMalloc(&d_dst, total_elements * sizeof(half))); + + // Copy source to device + RAFT_CUDA_TRY(cudaMemcpy(d_src, src, total_elements * sizeof(float), cudaMemcpyHostToDevice)); + + // Launch vectorized kernel for pairs + uint64_t n_pairs = total_elements / 2; + if (n_pairs > 0) { + uint32_t threads_per_block = 256; + uint32_t blocks = (n_pairs + threads_per_block - 1) / threads_per_block; + f32_to_f16_vectorized_kernel<<>>((const float2*)d_src, (half2*)d_dst, n_pairs); + } + + // Handle the tail if odd + if (total_elements % 2 != 0) { + f32_to_f16_tail_kernel<<<1, 1>>>(d_src, d_dst, total_elements - 1); + } + + RAFT_CUDA_TRY(cudaPeekAtLastError()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + // Copy result back to host + RAFT_CUDA_TRY(cudaMemcpy(dst, d_dst, total_elements * sizeof(half), cudaMemcpyDeviceToHost)); + + // Free device memory + cudaFree(d_src); + cudaFree(d_dst); + + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_convert_f32_to_f16", e.what()); + } +} + +} // extern "C" diff --git a/cgo/cuvs/helper.h b/cgo/cuvs/helper.h new file 
mode 100644 index 0000000000000..5ce108e6a714e --- /dev/null +++ b/cgo/cuvs/helper.h @@ -0,0 +1,67 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MO_CUVS_C_HELPER_H +#define MO_CUVS_C_HELPER_H + +#include "cuvs_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Returns the number of CUDA-capable devices available. + * @return Number of GPU devices. + */ +int gpu_get_device_count(); + +/** + * @brief Lists the IDs of available CUDA devices. + * @param devices Output array to store device IDs. + * @param max_count Maximum number of device IDs to store. + * @return Number of device IDs written to the array. + */ +int gpu_get_device_list(int* devices, int max_count); + +/** + * @brief Converts float32 data to float16 (half) on GPU. + * @param src Pointer to source float32 data on host or device. + * @param dst Pointer to destination float16 data on device. + * @param total_elements Total number of elements to convert. + * @param device_id ID of the GPU device to use. + * @param errmsg Pointer to store error message if any. + */ +void gpu_convert_f32_to_f16(const float* src, void* dst, uint64_t total_elements, int device_id, void* errmsg); + +/** + * @brief Standardized helper to set an error message. + * @param errmsg Pointer to the error message destination. + * @param prefix Prefix for the error message (e.g., function name). + * @param what The actual error description. 
+ */ +void set_errmsg(void* errmsg, const char* prefix, const char* what); + +#ifdef __cplusplus +} + +#include +namespace matrixone { + cuvs::distance::DistanceType convert_distance_type(distance_type_t metric_c); +} +#endif + +#endif // MO_CUVS_C_HELPER_H diff --git a/cgo/cuvs/ivf_flat.hpp b/cgo/cuvs/ivf_flat.hpp new file mode 100644 index 0000000000000..b8517934d233e --- /dev/null +++ b/cgo/cuvs/ivf_flat.hpp @@ -0,0 +1,383 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include "cuvs_types.h" // For distance_type_t, ivf_flat_build_params_t, etc. 
+#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include // For std::copy +#include // For simulation debug logs +#include +#include // For std::iota +#include // For std::runtime_error +#include +#include +#include +#include // For std::promise and std::future +#include // For std::numeric_limits +#include // For std::shared_mutex + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include // For raft::device_matrix +#include // Required for device_matrix_view +#include // For raft::host_matrix +#include // Core resource handle +#include // For raft::copy with type conversion +#include // For checking SNMG type + +// cuVS includes +#include // cuVS distance API +#include // IVF-Flat include +#pragma GCC diagnostic pop + + +namespace matrixone { + +/** + * @brief gpu_ivf_flat_t implements an IVF-Flat index that can run on a single GPU or sharded across multiple GPUs. + * It automatically chooses between single-GPU and multi-GPU (SNMG) cuVS APIs based on the RAFT handle resources. 
+ */ +template +class gpu_ivf_flat_t { +public: + using ivf_flat_index = cuvs::neighbors::ivf_flat::index; + using mg_index = cuvs::neighbors::mg_index; + + std::vector flattened_host_dataset; + std::vector devices_; + std::string filename_; + + // Internal index storage + std::unique_ptr index_; + std::unique_ptr mg_index_; + + cuvs::distance::DistanceType metric; + uint32_t dimension; + uint32_t count; + ivf_flat_build_params_t build_params; + distribution_mode_t dist_mode; + + std::unique_ptr worker; + std::shared_mutex mutex_; + bool is_loaded_ = false; + std::shared_ptr dataset_device_ptr_; // Keep device memory alive + + ~gpu_ivf_flat_t() { + destroy(); + } + + // Unified Constructor for building from dataset + gpu_ivf_flat_t(const T* dataset_data, uint64_t count_vectors, uint32_t dimension, + cuvs::distance::DistanceType m, const ivf_flat_build_params_t& bp, + const std::vector& devices, uint32_t nthread, distribution_mode_t mode) + : dimension(dimension), count(static_cast(count_vectors)), metric(m), + build_params(bp), dist_mode(mode), devices_(devices) { + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + worker = std::make_unique(nthread, devices_, force_mg || (devices_.size() > 1)); + + flattened_host_dataset.resize(count * dimension); + std::copy(dataset_data, dataset_data + (count * dimension), flattened_host_dataset.begin()); + } + + // Unified Constructor for loading from file + gpu_ivf_flat_t(const std::string& filename, uint32_t dimension, cuvs::distance::DistanceType m, + const ivf_flat_build_params_t& bp, const std::vector& devices, uint32_t nthread, distribution_mode_t mode) + : filename_(filename), dimension(dimension), metric(m), count(0), + build_params(bp), dist_mode(mode), devices_(devices) { + + bool force_mg = (mode == DistributionMode_SHARDED || mode == DistributionMode_REPLICATED); + worker = std::make_unique(nthread, devices_, force_mg || (devices_.size() > 1)); + } + + /** + * @brief Loads 
the index from file or builds it from the dataset. + */ + void load() { + std::unique_lock lock(mutex_); + if (is_loaded_) return; + + std::promise init_complete_promise; + std::future init_complete_future = init_complete_promise.get_future(); + + auto init_fn = [&](raft_handle_wrapper_t& handle) -> std::any { + auto res = handle.get_raft_resources(); + bool is_mg = is_snmg_handle(res); + + if (!filename_.empty()) { + if (is_mg) { + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_flat::deserialize(*res, filename_)); + // Update metadata + count = 0; + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) count += static_cast(iface.index_.value().size()); + } + if (!mg_index_->ann_interfaces_.empty() && mg_index_->ann_interfaces_[0].index_.has_value()) { + build_params.n_lists = static_cast(mg_index_->ann_interfaces_[0].index_.value().n_lists()); + } + } else { + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = metric; + index_ = std::make_unique(*res, index_params, dimension); + cuvs::neighbors::ivf_flat::deserialize(*res, filename_, index_.get()); + count = static_cast(index_->size()); + build_params.n_lists = static_cast(index_->n_lists()); + } + raft::resource::sync_stream(*res); + } else if (!flattened_host_dataset.empty()) { + if (count < build_params.n_lists) { + throw std::runtime_error("Dataset too small: count (" + std::to_string(count) + + ") must be >= n_list (" + std::to_string(build_params.n_lists) + + ") to build IVF index."); + } + + if (is_mg) { + auto dataset_host_view = raft::make_host_matrix_view( + flattened_host_dataset.data(), (int64_t)count, (int64_t)dimension); + + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = metric; + index_params.n_lists = build_params.n_lists; + index_params.add_data_on_build = build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = build_params.kmeans_trainset_fraction; + + 
cuvs::neighbors::mg_index_params mg_params(index_params); + if (dist_mode == DistributionMode_REPLICATED) { + mg_params.mode = cuvs::neighbors::distribution_mode::REPLICATED; + } else { + mg_params.mode = cuvs::neighbors::distribution_mode::SHARDED; + } + + mg_index_ = std::make_unique( + cuvs::neighbors::ivf_flat::build(*res, mg_params, dataset_host_view)); + } else { + auto dataset_device = new auto(raft::make_device_matrix( + *res, static_cast(count), static_cast(dimension))); + + dataset_device_ptr_ = std::shared_ptr(dataset_device, [](void* ptr) { + delete static_cast*>(ptr); + }); + + RAFT_CUDA_TRY(cudaMemcpyAsync(dataset_device->data_handle(), flattened_host_dataset.data(), + flattened_host_dataset.size() * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + cuvs::neighbors::ivf_flat::index_params index_params; + index_params.metric = metric; + index_params.n_lists = build_params.n_lists; + index_params.add_data_on_build = build_params.add_data_on_build; + index_params.kmeans_trainset_fraction = build_params.kmeans_trainset_fraction; + + index_ = std::make_unique( + cuvs::neighbors::ivf_flat::build(*res, index_params, raft::make_const_mdspan(dataset_device->view()))); + } + raft::resource::sync_stream(*res); + } + + init_complete_promise.set_value(true); + return std::any(); + }; + + auto stop_fn = [&](raft_handle_wrapper_t& handle) -> std::any { + index_.reset(); + mg_index_.reset(); + dataset_device_ptr_.reset(); + return std::any(); + }; + + worker->start(init_fn, stop_fn); + init_complete_future.get(); + is_loaded_ = true; + } + + /** + * @brief Serializes the index to a file. + * @param filename Path to the output file. 
+ */ + void save(const std::string& filename) { + if (!is_loaded_ || (!index_ && !mg_index_)) throw std::runtime_error("index not loaded"); + + uint64_t job_id = worker->submit( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); + auto res = handle.get_raft_resources(); + if (is_snmg_handle(res)) { + cuvs::neighbors::ivf_flat::serialize(*res, *mg_index_, filename); + } else { + cuvs::neighbors::ivf_flat::serialize(*res, filename, *index_); + } + raft::resource::sync_stream(*res); + return std::any(); + } + ); + + cuvs_task_result_t result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + } + + /** + * @brief Search result containing neighbor IDs and distances. + */ + struct search_result_t { + std::vector neighbors; // Indices of nearest neighbors + std::vector distances; // Distances to nearest neighbors + }; + + /** + * @brief Performs IVF-Flat search for given queries. + * @param queries_data Pointer to flattened query vectors on host. + * @param num_queries Number of query vectors. + * @param query_dimension Dimension of query vectors. + * @param limit Number of nearest neighbors to find. + * @param sp IVF-Flat search parameters. + * @return Search results. 
+ */ + search_result_t search(const T* queries_data, uint64_t num_queries, uint32_t query_dimension, + uint32_t limit, const ivf_flat_search_params_t& sp) { + if (!queries_data || num_queries == 0 || dimension == 0) return search_result_t{}; + if (query_dimension != dimension) throw std::runtime_error("dimension mismatch"); + if (!is_loaded_ || (!index_ && !mg_index_)) return search_result_t{}; + + uint64_t job_id = worker->submit( + [&, num_queries, limit, sp](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); + auto res = handle.get_raft_resources(); + + search_result_t search_res; + search_res.neighbors.resize(num_queries * limit); + search_res.distances.resize(num_queries * limit); + + cuvs::neighbors::ivf_flat::search_params search_params; + search_params.n_probes = sp.n_probes; + + if (is_snmg_handle(res)) { + auto queries_host_view = raft::make_host_matrix_view( + queries_data, (int64_t)num_queries, (int64_t)dimension); + auto neighbors_host_view = raft::make_host_matrix_view( + search_res.neighbors.data(), (int64_t)num_queries, (int64_t)limit); + auto distances_host_view = raft::make_host_matrix_view( + search_res.distances.data(), (int64_t)num_queries, (int64_t)limit); + + cuvs::neighbors::mg_search_params mg_search_params(search_params); + cuvs::neighbors::ivf_flat::search(*res, *mg_index_, mg_search_params, + queries_host_view, neighbors_host_view, distances_host_view); + } else { + auto queries_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(dimension)); + RAFT_CUDA_TRY(cudaMemcpyAsync(queries_device.data_handle(), queries_data, + num_queries * dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + auto neighbors_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + auto distances_device = raft::make_device_matrix( + *res, static_cast(num_queries), static_cast(limit)); + + cuvs::neighbors::ivf_flat::search(*res, 
search_params, *index_, + raft::make_const_mdspan(queries_device.view()), + neighbors_device.view(), distances_device.view()); + + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.neighbors.data(), neighbors_device.data_handle(), + search_res.neighbors.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + RAFT_CUDA_TRY(cudaMemcpyAsync(search_res.distances.data(), distances_device.data_handle(), + search_res.distances.size() * sizeof(float), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + } + + raft::resource::sync_stream(*res); + + for (size_t i = 0; i < search_res.neighbors.size(); ++i) { + if (search_res.neighbors[i] == std::numeric_limits::max() || + search_res.neighbors[i] == 4294967295LL || search_res.neighbors[i] < 0) { + search_res.neighbors[i] = -1; + } + } + return search_res; + } + ); + + cuvs_task_result_t result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + std::vector get_centers() { + if (!is_loaded_ || (!index_ && !mg_index_)) return {}; + + uint64_t job_id = worker->submit( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); + auto res = handle.get_raft_resources(); + + const ivf_flat_index* local_index = nullptr; + if (is_snmg_handle(res)) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) { local_index = &iface.index_.value(); break; } + } + } else { + local_index = index_.get(); + } + + if (!local_index) return std::vector{}; + + auto centers_view = local_index->centers(); + size_t n_centers = centers_view.extent(0); + size_t dim = centers_view.extent(1); + std::vector host_centers(n_centers * dim); + + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centers.data(), centers_view.data_handle(), + host_centers.size() * sizeof(T), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return 
host_centers; + } + ); + + cuvs_task_result_t result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + uint32_t get_n_list() { + std::shared_lock lock(mutex_); + if (!is_loaded_) return build_params.n_lists; + + if (index_) return static_cast(index_->n_lists()); + if (mg_index_) { + for (const auto& iface : mg_index_->ann_interfaces_) { + if (iface.index_.has_value()) return static_cast(iface.index_.value().n_lists()); + } + } + return build_params.n_lists; + } + + void destroy() { + if (worker) worker->stop(); + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/ivf_flat_c.cpp b/cgo/cuvs/ivf_flat_c.cpp new file mode 100644 index 0000000000000..8a66cb36c9813 --- /dev/null +++ b/cgo/cuvs/ivf_flat_c.cpp @@ -0,0 +1,254 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ivf_flat_c.h" +#include "ivf_flat.hpp" +#include +#include +#include +#include +#include +#include + +struct gpu_ivf_flat_any_t { + quantization_t qtype; + void* ptr; + + gpu_ivf_flat_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_ivf_flat_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_ivf_flat_c gpu_ivf_flat_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric_c, ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(static_cast(dataset_data), count_vectors, dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } 
catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_new", e.what()); + return nullptr; + } +} + +gpu_ivf_flat_c gpu_ivf_flat_load_file(const char* filename, uint32_t dimension, distance_type_t metric_c, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + std::vector devs(devices, devices + device_count); + void* ivf_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_F16: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_INT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + case Quantization_UINT8: + ivf_ptr = new matrixone::gpu_ivf_flat_t(std::string(filename), dimension, metric, build_params, devs, nthread, dist_mode); + break; + default: + throw std::runtime_error("Unsupported quantization type for IVF-Flat"); + } + return static_cast(new gpu_ivf_flat_any_t(qtype, ivf_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_load_file", e.what()); + return nullptr; + } +} + +void gpu_ivf_flat_destroy(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + delete any; + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_destroy", e.what()); + } +} + +void gpu_ivf_flat_load(gpu_ivf_flat_c index_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); 
+ switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->load(); break; + case Quantization_F16: static_cast*>(any->ptr)->load(); break; + case Quantization_INT8: static_cast*>(any->ptr)->load(); break; + case Quantization_UINT8: static_cast*>(any->ptr)->load(); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_load", e.what()); + } +} + +void gpu_ivf_flat_save(gpu_ivf_flat_c index_c, const char* filename, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: static_cast*>(any->ptr)->save(filename); break; + case Quantization_F16: static_cast*>(any->ptr)->save(filename); break; + case Quantization_INT8: static_cast*>(any->ptr)->save(filename); break; + case Quantization_UINT8: static_cast*>(any->ptr)->save(filename); break; + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_save", e.what()); + } +} + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search(gpu_ivf_flat_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t search_params, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_ivf_flat_search_res_t res = {nullptr}; + try { + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case 
Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + case Quantization_UINT8: { + auto* cpp_res = new matrixone::gpu_ivf_flat_t::search_result_t(); + *cpp_res = static_cast*>(any->ptr)->search(static_cast(queries_data), num_queries, query_dimension, limit, search_params); + res.result_ptr = static_cast(cpp_res); + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_search", e.what()); + } + return res; +} + +void gpu_ivf_flat_get_neighbors(gpu_ivf_flat_result_c result_c, uint64_t total_elements, int64_t* neighbors) { + if (!result_c) return; + // Using float's search_result_t is safe as neighbors is always int64_t + auto* neighbors_vec = &static_cast::search_result_t*>(result_c)->neighbors; + if (neighbors_vec->size() >= total_elements) { + std::copy(neighbors_vec->begin(), neighbors_vec->begin() + total_elements, neighbors); + } +} + +void gpu_ivf_flat_get_distances(gpu_ivf_flat_result_c result_c, uint64_t total_elements, float* distances) { + if (!result_c) return; + // Using float's search_result_t is safe as distances is always float + auto* distances_vec = &static_cast::search_result_t*>(result_c)->distances; + if (distances_vec->size() >= total_elements) { + std::copy(distances_vec->begin(), distances_vec->begin() + total_elements, distances); + } +} + +void gpu_ivf_flat_free_result(gpu_ivf_flat_result_c result_c) { + if (!result_c) return; + delete static_cast::search_result_t*>(result_c); +} + +void gpu_ivf_flat_get_centers(gpu_ivf_flat_c index_c, float* centers, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(index_c); + if (any->qtype == Quantization_F32) { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + 
std::copy(host_centers.begin(), host_centers.end(), centers); + } else if (any->qtype == Quantization_F16) { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + for (size_t i = 0; i < host_centers.size(); ++i) centers[i] = (float)host_centers[i]; + } else if (any->qtype == Quantization_INT8) { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + for (size_t i = 0; i < host_centers.size(); ++i) centers[i] = (float)host_centers[i]; + } else if (any->qtype == Quantization_UINT8) { + auto host_centers = static_cast*>(any->ptr)->get_centers(); + for (size_t i = 0; i < host_centers.size(); ++i) centers[i] = (float)host_centers[i]; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_ivf_flat_get_centers", e.what()); + } +} + +uint32_t gpu_ivf_flat_get_n_list(gpu_ivf_flat_c index_c) { + if (!index_c) return 0; + auto* any = static_cast(index_c); + switch (any->qtype) { + case Quantization_F32: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_F16: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_INT8: return static_cast*>(any->ptr)->get_n_list(); + case Quantization_UINT8: return static_cast*>(any->ptr)->get_n_list(); + default: return 0; + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +template class gpu_ivf_flat_t; +} diff --git a/cgo/cuvs/ivf_flat_c.h b/cgo/cuvs/ivf_flat_c.h new file mode 100644 index 0000000000000..deb81588a50ba --- /dev/null +++ b/cgo/cuvs/ivf_flat_c.h @@ -0,0 +1,80 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef IVF_FLAT_C_H +#define IVF_FLAT_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_ivf_flat_t object +typedef void* gpu_ivf_flat_c; + +// Opaque pointer to the C++ IVF-Flat search result object +typedef void* gpu_ivf_flat_result_c; + +// Constructor for building from dataset +gpu_ivf_flat_c gpu_ivf_flat_new(const void* dataset_data, uint64_t count_vectors, uint32_t dimension, + distance_type_t metric, ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Constructor for loading from file +gpu_ivf_flat_c gpu_ivf_flat_load_file(const char* filename, uint32_t dimension, distance_type_t metric, + ivf_flat_build_params_t build_params, + const int* devices, int device_count, uint32_t nthread, + distribution_mode_t dist_mode, quantization_t qtype, void* errmsg); + +// Destructor +void gpu_ivf_flat_destroy(gpu_ivf_flat_c index_c, void* errmsg); + +// Load function (actually triggers the build/load logic) +void gpu_ivf_flat_load(gpu_ivf_flat_c index_c, void* errmsg); + +// Save function +void gpu_ivf_flat_save(gpu_ivf_flat_c index_c, const char* filename, void* errmsg); + +// Search function +typedef struct { + gpu_ivf_flat_result_c result_ptr; +} gpu_ivf_flat_search_res_t; + +gpu_ivf_flat_search_res_t gpu_ivf_flat_search(gpu_ivf_flat_c index_c, const void* queries_data, uint64_t num_queries, + uint32_t query_dimension, uint32_t limit, + ivf_flat_search_params_t 
search_params, void* errmsg); + +// Get results from result object +void gpu_ivf_flat_get_neighbors(gpu_ivf_flat_result_c result_c, uint64_t total_elements, int64_t* neighbors); +void gpu_ivf_flat_get_distances(gpu_ivf_flat_result_c result_c, uint64_t total_elements, float* distances); + +// Free result object +void gpu_ivf_flat_free_result(gpu_ivf_flat_result_c result_c); + +// Gets the trained centroids +void gpu_ivf_flat_get_centers(gpu_ivf_flat_c index_c, float* centers, void* errmsg); + +// Gets the number of lists (centroids) +uint32_t gpu_ivf_flat_get_n_list(gpu_ivf_flat_c index_c); + +#ifdef __cplusplus +} +#endif + +#endif // IVF_FLAT_C_H diff --git a/cgo/cuvs/kmeans.hpp b/cgo/cuvs/kmeans.hpp new file mode 100644 index 0000000000000..cc8dbb28b86c5 --- /dev/null +++ b/cgo/cuvs/kmeans.hpp @@ -0,0 +1,273 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cuvs_worker.hpp" // For cuvs_worker_t and raft_handle_wrapper_t +#include "cuvs_types.h" // For distance_type_t and quantization_t +#include // For RAFT_CUDA_TRY +#include // For half + +// Standard library includes +#include +#include +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +// RAFT includes +#include +#include +#include +#include +#include + +// cuVS includes +#include +#include +#pragma GCC diagnostic pop + +namespace matrixone { + +/** + * @brief gpu_kmeans_t implements K-Means clustering on GPU using cuVS. + */ +template +class gpu_kmeans_t { +public: + uint32_t n_clusters; + uint32_t dimension; + + cuvs::cluster::kmeans::balanced_params params; + + // Type of centroids and inertia. cuVS uses float for these even if input is half, int8, or uint8. + using CentroidT = float; + + // Internal storage for centroids on device + std::unique_ptr> centroids_; + std::unique_ptr worker; + std::shared_mutex mutex_; + + gpu_kmeans_t(uint32_t n_clusters, uint32_t dimension, cuvs::distance::DistanceType metric, + int max_iter = 20, int device_id = 0, uint32_t nthread = 1) + : n_clusters(n_clusters), dimension(dimension) { + + params.n_iters = static_cast(max_iter); + params.metric = metric; + + // K-Means in cuVS is currently single-GPU focused in the main cluster API + worker = std::make_unique(nthread, device_id); + worker->start(); + } + + ~gpu_kmeans_t() { + destroy(); + } + + struct fit_result_t { + float inertia; + int64_t n_iter; + }; + + /** + * @brief Computes the cluster centroids. 
+ */ + fit_result_t fit(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {0, 0}; + + uint64_t job_id = worker->submit( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::unique_lock lock(mutex_); + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(dimension))); + } + + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view()); + + raft::resource::sync_stream(*res); + return fit_result_t{0.0f, static_cast(params.n_iters)}; + } + ); + auto result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + struct predict_result_t { + std::vector labels; + float inertia; + }; + + /** + * @brief Assigns labels to new data based on existing centroids. + */ + predict_result_t predict(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {{}, 0}; + + uint64_t job_id = worker->submit( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); + if (!centroids_) throw std::runtime_error("KMeans centroids not trained. 
Call fit() first."); + + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; iwait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + struct fit_predict_result_t { + std::vector labels; + float inertia; + int64_t n_iter; + }; + + /** + * @brief Performs both fitting and labeling in one step. 
+ */ + fit_predict_result_t fit_predict(const T* X_data, uint64_t n_samples) { + if (!X_data || n_samples == 0) return {{}, 0, 0}; + + uint64_t job_id = worker->submit( + [&, X_data, n_samples](raft_handle_wrapper_t& handle) -> std::any { + std::unique_lock lock(mutex_); + auto res = handle.get_raft_resources(); + + auto X_device = raft::make_device_matrix( + *res, static_cast(n_samples), static_cast(dimension)); + + RAFT_CUDA_TRY(cudaMemcpyAsync(X_device.data_handle(), X_data, + n_samples * dimension * sizeof(T), cudaMemcpyHostToDevice, + raft::resource::get_cuda_stream(*res))); + + if (!centroids_) { + centroids_ = std::make_unique>( + raft::make_device_matrix(*res, static_cast(n_clusters), static_cast(dimension))); + } + + fit_predict_result_t res_out; + res_out.labels.resize(n_samples); + auto labels_device = raft::make_device_vector(*res, static_cast(n_samples)); + + if constexpr (std::is_same_v || std::is_same_v) { + cuvs::cluster::kmeans::fit_predict(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view(), + labels_device.view()); + } else { + // Fallback for half and uint8_t which might missing fit_predict overload in some cuVS versions + cuvs::cluster::kmeans::fit(*res, params, + raft::make_const_mdspan(X_device.view()), + centroids_->view()); + cuvs::cluster::kmeans::predict(*res, params, + raft::make_const_mdspan(X_device.view()), + raft::make_const_mdspan(centroids_->view()), + labels_device.view()); + } + + std::vector host_labels(n_samples); + RAFT_CUDA_TRY(cudaMemcpyAsync(host_labels.data(), labels_device.data_handle(), + n_samples * sizeof(uint32_t), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + for(uint64_t i=0; i(params.n_iters); + return res_out; + } + ); + auto result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast(result.result); + } + + /** + * @brief Returns the trained centroids. 
+ */ + std::vector get_centroids() { + uint64_t job_id = worker->submit( + [&](raft_handle_wrapper_t& handle) -> std::any { + std::shared_lock lock(mutex_); + if (!centroids_) return std::vector{}; + + auto res = handle.get_raft_resources(); + std::vector host_centroids(n_clusters * dimension); + + RAFT_CUDA_TRY(cudaMemcpyAsync(host_centroids.data(), centroids_->data_handle(), + host_centroids.size() * sizeof(CentroidT), cudaMemcpyDeviceToHost, + raft::resource::get_cuda_stream(*res))); + + raft::resource::sync_stream(*res); + return host_centroids; + } + ); + auto result = worker->wait(job_id).get(); + if (result.error) std::rethrow_exception(result.error); + return std::any_cast>(result.result); + } + + void destroy() { + if (worker) worker->stop(); + } +}; + +} // namespace matrixone diff --git a/cgo/cuvs/kmeans_c.cpp b/cgo/cuvs/kmeans_c.cpp new file mode 100644 index 0000000000000..04009437afc64 --- /dev/null +++ b/cgo/cuvs/kmeans_c.cpp @@ -0,0 +1,264 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "kmeans_c.h" +#include "kmeans.hpp" +#include +#include +#include +#include +#include +#include + +struct gpu_kmeans_any_t { + quantization_t qtype; + void* ptr; + + gpu_kmeans_any_t(quantization_t q, void* p) : qtype(q), ptr(p) {} + ~gpu_kmeans_any_t() { + switch (qtype) { + case Quantization_F32: delete static_cast*>(ptr); break; + case Quantization_F16: delete static_cast*>(ptr); break; + case Quantization_INT8: delete static_cast*>(ptr); break; + case Quantization_UINT8: delete static_cast*>(ptr); break; + default: break; + } + } +}; + +extern "C" { + +gpu_kmeans_c gpu_kmeans_new(uint32_t n_clusters, uint32_t dimension, distance_type_t metric_c, + int max_iter, int device_id, uint32_t nthread, + quantization_t qtype, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + cuvs::distance::DistanceType metric = matrixone::convert_distance_type(metric_c); + void* kmeans_ptr = nullptr; + switch (qtype) { + case Quantization_F32: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_F16: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_INT8: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + case Quantization_UINT8: + kmeans_ptr = new matrixone::gpu_kmeans_t(n_clusters, dimension, metric, max_iter, device_id, nthread); + break; + default: + throw std::runtime_error("Unsupported quantization type for KMeans"); + } + return static_cast(new gpu_kmeans_any_t(qtype, kmeans_ptr)); + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_new", e.what()); + return nullptr; + } +} + +void gpu_kmeans_destroy(gpu_kmeans_c kmeans_c, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + delete any; + } catch (const std::exception& e) { + 
set_errmsg(errmsg, "Error in gpu_kmeans_destroy", e.what()); + } +} + +gpu_kmeans_fit_res_t gpu_kmeans_fit(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_res_t res = {0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; + res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_F16: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; + res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_INT8: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; + res.n_iter = cpp_res.n_iter; + break; + } + case Quantization_UINT8: { + auto cpp_res = static_cast*>(any->ptr)->fit(static_cast(X_data), n_samples); + res.inertia = cpp_res.inertia; + res.n_iter = cpp_res.n_iter; + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_fit", e.what()); + } + return res; +} + +gpu_kmeans_predict_res_t gpu_kmeans_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_predict_res_t res = {nullptr, 0.0f}; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_kmeans_t::predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_kmeans_t::predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia 
= (float)cpp_res->inertia; + break; + } + case Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_kmeans_t::predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + break; + } + case Quantization_UINT8: { + auto* cpp_res = new matrixone::gpu_kmeans_t::predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_predict", e.what()); + } + return res; +} + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + gpu_kmeans_fit_predict_res_t res = {nullptr, 0.0f, 0}; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto* cpp_res = new matrixone::gpu_kmeans_t::fit_predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + res.n_iter = cpp_res->n_iter; + break; + } + case Quantization_F16: { + auto* cpp_res = new matrixone::gpu_kmeans_t::fit_predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia = (float)cpp_res->inertia; + res.n_iter = cpp_res->n_iter; + break; + } + case Quantization_INT8: { + auto* cpp_res = new matrixone::gpu_kmeans_t::fit_predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + res.n_iter = cpp_res->n_iter; + break; + } + case Quantization_UINT8: { + auto* cpp_res = new 
matrixone::gpu_kmeans_t::fit_predict_result_t(); + *cpp_res = static_cast*>(any->ptr)->fit_predict(static_cast(X_data), n_samples); + res.result_ptr = static_cast(cpp_res); + res.inertia = cpp_res->inertia; + res.n_iter = cpp_res->n_iter; + break; + } + default: break; + } + } catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_fit_predict", e.what()); + } + return res; +} + +void gpu_kmeans_get_labels(gpu_kmeans_result_c result_c, uint64_t n_samples, int64_t* labels) { + if (!result_c) return; + // Both predict_result_t and fit_predict_result_t have labels as their first member + auto* labels_vec = &static_cast::predict_result_t*>(result_c)->labels; + if (labels_vec->size() >= n_samples) { + std::copy(labels_vec->begin(), labels_vec->begin() + n_samples, labels); + } +} + +void gpu_kmeans_free_result(gpu_kmeans_result_c result_c) { + if (!result_c) return; + // Using float's predict_result_t is safe as labels is same + delete static_cast::predict_result_t*>(result_c); +} + +void gpu_kmeans_get_centroids(gpu_kmeans_c kmeans_c, void* centroids, void* errmsg) { + if (errmsg) *(static_cast(errmsg)) = nullptr; + try { + auto* any = static_cast(kmeans_c); + switch (any->qtype) { + case Quantization_F32: { + auto host_centroids = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centroids.begin(), host_centroids.end(), static_cast(centroids)); + break; + } + case Quantization_F16: { + auto host_centroids = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centroids.begin(), host_centroids.end(), static_cast(centroids)); + break; + } + case Quantization_INT8: { + auto host_centroids = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centroids.begin(), host_centroids.end(), static_cast(centroids)); + break; + } + case Quantization_UINT8: { + auto host_centroids = static_cast*>(any->ptr)->get_centroids(); + std::copy(host_centroids.begin(), host_centroids.end(), static_cast(centroids)); + break; + } + default: break; + } + 
} catch (const std::exception& e) { + set_errmsg(errmsg, "Error in gpu_kmeans_get_centroids", e.what()); + } +} + +} // extern "C" + +namespace matrixone { +template class gpu_kmeans_t; +template class gpu_kmeans_t; +template class gpu_kmeans_t; +template class gpu_kmeans_t; +} diff --git a/cgo/cuvs/kmeans_c.h b/cgo/cuvs/kmeans_c.h new file mode 100644 index 0000000000000..f67fdcf0981b9 --- /dev/null +++ b/cgo/cuvs/kmeans_c.h @@ -0,0 +1,79 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef KMEANS_C_H +#define KMEANS_C_H + +#include "helper.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque pointer to the C++ gpu_kmeans_t object +typedef void* gpu_kmeans_c; + +// Opaque pointer to the C++ KMeans result object +typedef void* gpu_kmeans_result_c; + +// Constructor +gpu_kmeans_c gpu_kmeans_new(uint32_t n_clusters, uint32_t dimension, distance_type_t metric, + int max_iter, int device_id, uint32_t nthread, + quantization_t qtype, void* errmsg); + +// Destructor +void gpu_kmeans_destroy(gpu_kmeans_c kmeans_c, void* errmsg); + +// Fit function +typedef struct { + float inertia; + int64_t n_iter; +} gpu_kmeans_fit_res_t; + +gpu_kmeans_fit_res_t gpu_kmeans_fit(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +// Predict function +typedef struct { + gpu_kmeans_result_c result_ptr; + float inertia; +} gpu_kmeans_predict_res_t; + +gpu_kmeans_predict_res_t gpu_kmeans_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +// FitPredict function +typedef struct { + gpu_kmeans_result_c result_ptr; + float inertia; + int64_t n_iter; +} gpu_kmeans_fit_predict_res_t; + +gpu_kmeans_fit_predict_res_t gpu_kmeans_fit_predict(gpu_kmeans_c kmeans_c, const void* X_data, uint64_t n_samples, void* errmsg); + +// Get results from result object +void gpu_kmeans_get_labels(gpu_kmeans_result_c result_c, uint64_t n_samples, int64_t* labels); + +// Free result object +void gpu_kmeans_free_result(gpu_kmeans_result_c result_c); + +// Get centroids +void gpu_kmeans_get_centroids(gpu_kmeans_c kmeans_c, void* centroids, void* errmsg); + +#ifdef __cplusplus +} +#endif + +#endif // KMEANS_C_H diff --git a/cgo/cuvs/test/brute_force_test.cu b/cgo/cuvs/test/brute_force_test.cu new file mode 100644 index 0000000000000..5c03bda22fa80 --- /dev/null +++ b/cgo/cuvs/test/brute_force_test.cu @@ -0,0 +1,212 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 
(the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "brute_force.hpp" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +// --- Helper to convert float to half --- +static std::vector float_to_half(const std::vector& src) { + std::vector dst(src.size()); + for (size_t i = 0; i < src.size(); ++i) { + dst[i] = __float2half(src[i]); + } + return dst; +} + +// --- GpuBruteForceTest --- + +TEST(GpuBruteForceTest, BasicLoadAndSearch) { + const uint32_t dimension = 3; + const uint64_t count = 2; + std::vector dataset = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.load(); + + std::vector queries = {1.0, 2.0, 3.0}; + auto result = index.search(queries.data(), 1, dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + ASSERT_EQ(result.neighbors[0], 0); + ASSERT_EQ(result.distances[0], 0.0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithMultipleQueries) { + const uint32_t dimension = 4; + const uint64_t count = 4; + std::vector dataset = { + 1.0, 0.0, 0.0, 0.0, // ID 0 + 0.0, 1.0, 0.0, 0.0, // ID 1 + 0.0, 0.0, 1.0, 0.0, // ID 2 + 0.0, 0.0, 0.0, 1.0 // ID 3 + }; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.load(); + + std::vector queries = { + 1.0, 0.0, 0.0, 0.0, // Should match ID 0 + 0.0, 0.0, 1.0, 0.0 // Should match ID 2 + }; + auto 
result = index.search(queries.data(), 2, dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_EQ(result.neighbors[0], 0); + ASSERT_EQ(result.neighbors[1], 2); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithFloat16) { + const uint32_t dimension = 2; + const uint64_t count = 2; + std::vector f_dataset = {1.0, 1.0, 2.0, 2.0}; + std::vector h_dataset = float_to_half(f_dataset); + + gpu_brute_force_t index(h_dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.load(); + + std::vector f_queries = {1.0, 1.0}; + std::vector h_queries = float_to_half(f_queries); + auto result = index.search(h_queries.data(), 1, dimension, 1); + + ASSERT_EQ(result.neighbors.size(), (size_t)1); + ASSERT_EQ(result.neighbors[0], 0); + ASSERT_EQ(result.distances[0], 0.0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, SearchWithInnerProduct) { + const uint32_t dimension = 2; + const uint64_t count = 2; + std::vector dataset = { + 1.0, 0.0, + 0.0, 1.0 + }; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::InnerProduct, 1, 0); + index.load(); + + std::vector queries = {1.0, 0.0}; + auto result = index.search(queries.data(), 1, dimension, 2); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_EQ(result.neighbors[0], 0); + ASSERT_EQ(result.neighbors[1], 1); + + // dot product should be 1.0 for exact match + ASSERT_TRUE(std::abs(result.distances[0] - 1.0) < 1e-5); + ASSERT_TRUE(std::abs(result.distances[1] - 0.0) < 1e-5); + + index.destroy(); +} + +TEST(GpuBruteForceTest, EmptyDataset) { + const uint32_t dimension = 128; + const uint64_t count = 0; + + gpu_brute_force_t index(nullptr, count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.load(); + + std::vector queries(dimension, 0.0); + auto result = index.search(queries.data(), 1, dimension, 5); + + ASSERT_EQ(result.neighbors.size(), (size_t)0); + + index.destroy(); +} + +TEST(GpuBruteForceTest, 
LargeLimit) { + const uint32_t dimension = 2; + const uint64_t count = 5; + std::vector dataset(count * dimension, 1.0); + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.load(); + + std::vector queries(dimension, 1.0); + uint32_t limit = 10; + auto result = index.search(queries.data(), 1, dimension, limit); + + ASSERT_EQ(result.neighbors.size(), (size_t)limit); + for (int i = 0; i < 5; ++i) ASSERT_GE(result.neighbors[i], 0); + for (int i = 5; i < 10; ++i) ASSERT_EQ((int64_t)result.neighbors[i], (int64_t)-1); + + index.destroy(); +} + +// --- CuvsWorkerTest --- + +TEST(CuvsWorkerTest, BruteForceSearch) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads, 0); // Added device_id + worker.start(); + + const uint32_t dimension = 128; + const uint64_t count = 1000; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 1, 0); + index.load(); + + std::vector queries = std::vector(dataset.begin(), dataset.begin() + dimension); + auto result = index.search(queries.data(), 1, dimension, 5); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0); + + index.destroy(); + worker.stop(); +} + +TEST(CuvsWorkerTest, ConcurrentSearches) { + const uint32_t dimension = 16; + const uint64_t count = 100; + std::vector dataset(count * dimension); + // Use very distinct values to ensure unique neighbors + for (size_t i = 0; i < count; ++i) { + for (size_t j = 0; j < dimension; ++j) { + dataset[i * dimension + j] = (float)i * 100.0f + (float)j; + } + } + + gpu_brute_force_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, 4, 0); + index.load(); + + const int num_threads = 4; + std::vector> futures; + for (int i = 0; i < num_threads; ++i) { + 
futures.push_back(std::async(std::launch::async, [&index, dimension, &dataset, i]() { + std::vector query = std::vector(dataset.begin() + i * dimension, dataset.begin() + (i + 1) * dimension); + auto res = index.search(query.data(), 1, dimension, 1); + ASSERT_EQ(res.neighbors[0], (int64_t)i); + })); + } + + for (auto& f : futures) f.get(); + + index.destroy(); +} diff --git a/cgo/cuvs/test/cagra_test.cu b/cgo/cuvs/test/cagra_test.cu new file mode 100644 index 0000000000000..92e4762919fcd --- /dev/null +++ b/cgo/cuvs/test/cagra_test.cu @@ -0,0 +1,101 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cuvs_worker.hpp" +#include "cagra.hpp" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuCagraTest, BasicLoadAndSearch) { + const uint32_t dimension = 16; + const uint64_t count = 100; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + std::vector devices = {0}; + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.load(); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0); + + index.destroy(); +} + +TEST(GpuCagraTest, SaveAndLoadFromFile) { + const uint32_t dimension = 16; + const uint64_t count = 100; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + std::string filename = "test_cagra.bin"; + std::vector devices = {0}; + + // 1. Build and Save + { + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.load(); + index.save(filename); + index.destroy(); + } + + // 2. 
Load and Search + { + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.load(); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuCagraTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 100; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + std::vector devices = {0}; + cagra_build_params_t bp = cagra_build_params_default(); + gpu_cagra_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.load(); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + cagra_search_params_t sp = cagra_search_params_default(); + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0); + + index.destroy(); +} diff --git a/cgo/cuvs/test/ivf_flat_test.cu b/cgo/cuvs/test/ivf_flat_test.cu new file mode 100644 index 0000000000000..18ab4c1586f6d --- /dev/null +++ b/cgo/cuvs/test/ivf_flat_test.cu @@ -0,0 +1,120 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "ivf_flat.hpp" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +TEST(GpuIvfFlatTest, BasicLoadSearchAndCenters) { + const uint32_t dimension = 2; + const uint64_t count = 4; + std::vector dataset = { + 1.0, 1.0, + 1.1, 1.1, + 100.0, 100.0, + 101.0, 101.0 + }; + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.load(); + + // Verify centers + auto centers = index.get_centers(); + ASSERT_EQ(centers.size(), (size_t)(2 * dimension)); + TEST_LOG("IVF-Flat Centers: " << centers[0] << ", " << centers[1]); + + std::vector queries = {1.05, 1.05}; + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + // Should be either 0 or 1 + ASSERT_TRUE(result.neighbors[0] == 0 || result.neighbors[0] == 1); + + index.destroy(); +} + +TEST(GpuIvfFlatTest, SaveAndLoadFromFile) { + const uint32_t dimension = 2; + const uint64_t count = 4; + std::vector dataset = {1.0, 1.0, 1.1, 1.1, 100.0, 100.0, 101.0, 101.0}; + std::string filename = "test_ivf_flat.bin"; + std::vector devices = {0}; + + // 1. 
Build and Save + { + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.load(); + index.save(filename); + index.destroy(); + } + + // 2. Load and Search + { + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 2; + gpu_ivf_flat_t index(filename, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SINGLE_GPU); + index.load(); + + std::vector queries = {100.5, 100.5}; + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 2, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)2); + ASSERT_TRUE(result.neighbors[0] == 2 || result.neighbors[0] == 3); + + index.destroy(); + } + + std::remove(filename.c_str()); +} + +TEST(GpuIvfFlatTest, ShardedModeSimulation) { + const uint32_t dimension = 16; + const uint64_t count = 100; + std::vector dataset(count * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)i / dataset.size(); + + std::vector devices = {0}; + ivf_flat_build_params_t bp = ivf_flat_build_params_default(); + bp.n_lists = 5; + gpu_ivf_flat_t index(dataset.data(), count, dimension, cuvs::distance::DistanceType::L2Expanded, bp, devices, 1, DistributionMode_SHARDED); + index.load(); + + auto centers = index.get_centers(); + ASSERT_EQ(centers.size(), (size_t)(5 * dimension)); + + std::vector queries(dataset.begin(), dataset.begin() + dimension); + ivf_flat_search_params_t sp = ivf_flat_search_params_default(); + sp.n_probes = 2; + auto result = index.search(queries.data(), 1, dimension, 5, sp); + + ASSERT_EQ(result.neighbors.size(), (size_t)5); + ASSERT_EQ(result.neighbors[0], 0); + + index.destroy(); +} diff --git a/cgo/cuvs/test/kmeans_test.cu b/cgo/cuvs/test/kmeans_test.cu new file mode 100644 index 
0000000000000..c8f00068f8fe2 --- /dev/null +++ b/cgo/cuvs/test/kmeans_test.cu @@ -0,0 +1,86 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "kmeans.hpp" +#include "test_framework.hpp" +#include +#include +#include + +using namespace matrixone; + +TEST(GpuKMeansTest, BasicFitAndPredict) { + const uint32_t n_clusters = 3; + const uint32_t dimension = 2; + const uint64_t n_samples = 9; + + // Create 3 clusters of points + std::vector dataset = { + 0.1f, 0.1f, 0.0f, 0.2f, 0.2f, 0.0f, // Cluster 0 + 10.1f, 10.1f, 10.0f, 10.2f, 10.2f, 10.0f, // Cluster 1 + 20.1f, 20.1f, 20.0f, 20.2f, 20.2f, 20.0f // Cluster 2 + }; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + + auto fit_res = kmeans.fit(dataset.data(), n_samples); + ASSERT_GE(fit_res.n_iter, 1); + + auto predict_res = kmeans.predict(dataset.data(), n_samples); + ASSERT_EQ(predict_res.labels.size(), (size_t)n_samples); + + // Since we use balanced_params, it might prioritize balancing cluster sizes over spatial distance + // on very small datasets. We just check that all labels are within range [0, nClusters). 
+ for (size_t i = 0; i < n_samples; ++i) { + ASSERT_TRUE(predict_res.labels[i] >= 0 && predict_res.labels[i] < (int64_t)n_clusters); + } + + kmeans.destroy(); +} + +TEST(GpuKMeansTest, FitPredict) { + const uint32_t n_clusters = 2; + const uint32_t dimension = 4; + const uint64_t n_samples = 10; + std::vector dataset(n_samples * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + + auto res = kmeans.fit_predict(dataset.data(), n_samples); + ASSERT_EQ(res.labels.size(), (size_t)n_samples); + ASSERT_GE(res.n_iter, 1); + + kmeans.destroy(); +} + +TEST(GpuKMeansTest, GetCentroids) { + const uint32_t n_clusters = 5; + const uint32_t dimension = 8; + const uint64_t n_samples = 50; + std::vector dataset(n_samples * dimension); + for (size_t i = 0; i < dataset.size(); ++i) dataset[i] = (float)rand() / RAND_MAX; + + gpu_kmeans_t kmeans(n_clusters, dimension, cuvs::distance::DistanceType::L2Expanded, 20, 0, 1); + + kmeans.fit(dataset.data(), n_samples); + auto centroids = kmeans.get_centroids(); + + ASSERT_EQ(centroids.size(), (size_t)(n_clusters * dimension)); + + kmeans.destroy(); +} diff --git a/cgo/cuvs/test/main_test.cu b/cgo/cuvs/test/main_test.cu new file mode 100644 index 0000000000000..a2b8ecbd23cd9 --- /dev/null +++ b/cgo/cuvs/test/main_test.cu @@ -0,0 +1,186 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cuvs_worker.hpp" +#include "test_framework.hpp" +#include +#include + +using namespace matrixone; + +thread_local bool current_test_failed = false; + +// --- thread_safe_queue_t Tests --- + +TEST(ThreadSafeQueueTest, BasicPushPop) { + thread_safe_queue_t q; + q.push(1); + q.push(2); + + int val; + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 1); + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 2); +} + +TEST(ThreadSafeQueueTest, PopEmptyBlocking) { + thread_safe_queue_t q; + int val = 0; + + auto fut = std::async(std::launch::async, [&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + q.push(42); + }); + + ASSERT_TRUE(q.pop(val)); + ASSERT_EQ(val, 42); +} + +TEST(ThreadSafeQueueTest, StopQueue) { + thread_safe_queue_t q; + int val; + + auto fut = std::async(std::launch::async, [&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + q.stop(); + }); + + ASSERT_FALSE(q.pop(val)); // Should return false after stop + ASSERT_TRUE(q.is_stopped()); +} + +// --- cuvs_task_result_store_t Tests --- + +TEST(CuvsTaskResultStoreTest, BasicStoreRetrieve) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + + cuvs_task_result_t res{id, 100, nullptr}; + store.store(res); + + auto fut = store.wait(id); + auto retrieved = fut.get(); + ASSERT_EQ(std::any_cast(retrieved.result), 100); +} + +TEST(CuvsTaskResultStoreTest, AsyncWait) { + cuvs_task_result_store_t store; + uint64_t id = store.get_next_job_id(); + + auto fut = store.wait(id); + + std::thread t([&]() { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + store.store({id, std::string("async"), nullptr}); + }); + + auto retrieved = fut.get(); + ASSERT_EQ(std::any_cast(retrieved.result), std::string("async")); + t.join(); +} + +TEST(CuvsTaskResultStoreTest, StopStore) { + cuvs_task_result_store_t store; + uint64_t id = 
store.get_next_job_id(); + auto fut = store.wait(id); + + store.stop(); + + ASSERT_THROW(fut.get(), std::runtime_error); +} + +// --- raft_handle_wrapper_t and is_snmg_handle Tests --- + +TEST(RaftHandleWrapperTest, DetectSingleGpu) { + std::vector devices = {0}; + raft_handle_wrapper_t wrapper(devices, false); // force_mg = false + ASSERT_FALSE(is_snmg_handle(wrapper.get_raft_resources())); +} + +TEST(RaftHandleWrapperTest, DetectMultiGpuForced) { + std::vector devices = {0}; + raft_handle_wrapper_t wrapper(devices, true); // force_mg = true + ASSERT_TRUE(is_snmg_handle(wrapper.get_raft_resources())); +} + +// --- cuvs_worker_t Tests --- + +TEST(CuvsWorkerTest, BasicLifecycle) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + worker.stop(); +} + +TEST(CuvsWorkerTest, SubmitTask) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + + auto task = [](raft_handle_wrapper_t&) -> std::any { + return std::string("success"); + }; + + uint64_t job_id = worker.submit(task); + auto result = worker.wait(job_id).get(); + + ASSERT_EQ(std::any_cast(result.result), std::string("success")); + + worker.stop(); +} + +TEST(CuvsWorkerTest, MultipleThreads) { + uint32_t n_threads = 4; + cuvs_worker_t worker(n_threads); + worker.start(); + + std::vector ids; + for (int i = 0; i < 10; ++i) { + ids.push_back(worker.submit([i](raft_handle_wrapper_t&) -> std::any { + return i * 2; + })); + } + + for (int i = 0; i < 10; ++i) { + auto res = worker.wait(ids[i]).get(); + ASSERT_EQ(std::any_cast(res.result), i * 2); + } + + worker.stop(); +} + +TEST(CuvsWorkerTest, TaskErrorHandling) { + uint32_t n_threads = 1; + cuvs_worker_t worker(n_threads); + worker.start(); + + auto fail_task = [](raft_handle_wrapper_t&) -> std::any { + throw std::runtime_error("task failed intentionally"); + }; + + uint64_t job_id = worker.submit(fail_task); + auto result = worker.wait(job_id).get(); + + ASSERT_TRUE(result.error != nullptr); + 
ASSERT_TRUE(has_exception(result.error)); + + worker.stop(); +} + +int main() { + return RUN_ALL_TESTS(); +} diff --git a/cgo/cuvs/test/test_framework.hpp b/cgo/cuvs/test/test_framework.hpp new file mode 100644 index 0000000000000..f995f514686da --- /dev/null +++ b/cgo/cuvs/test/test_framework.hpp @@ -0,0 +1,150 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include // For std::iota +#include // For std::async +#include +#include +#include +#include // For building string messages +#include // For std::sort +#include // For std::any comparisons in assertions + +// --- Minimal Custom Test Framework (Stub for compilation) --- + +// Logging - minimal versions +#define TEST_LOG(msg) std::cout << "[INFO ] " << msg << std::endl +#define TEST_ERROR(msg) std::cerr << "[ERROR ] " << msg << std::endl + +// Global flag to indicate if the current test has failed (kept minimal) +extern thread_local bool current_test_failed; + +// Helper to build string messages for assertions (handles various types) +template +std::string to_string_for_assertion(const T& val) { + std::ostringstream oss; + oss << val; + return oss.str(); +} +inline std::string to_string_for_assertion(const std::any&) { return "std::any"; } // Simplified +inline std::string to_string_for_assertion(const char* val) { return std::string(val); } + +// Helper to check if an exception_ptr holds a 
specific exception type (kept minimal) +template +inline bool has_exception(const std::exception_ptr& ep) { + if (!ep) return false; + try { + std::rethrow_exception(ep); + } catch (const E& e) { + return true; + } catch (...) { + return false; + } +} + +// Assertions - simplified to just return/log if condition is false +#define REPORT_FAILURE(msg_str) do { TEST_ERROR(msg_str); current_test_failed = true; return; } while (0) +#define ASSERT_TRUE(condition) do { if (!(condition)) { REPORT_FAILURE("ASSERT_TRUE failed: " #condition); } } while (0) +#define ASSERT_FALSE(condition) ASSERT_TRUE(!(condition)) +#define ASSERT_EQ(val1, val2) do { \ + auto v1 = (val1); \ + auto v2 = (val2); \ + if (!(v1 == v2)) { \ + std::ostringstream oss; \ + oss << "ASSERT_EQ failed: " << #val1 << " (" << v1 << ") vs " << #val2 << " (" << v2 << ")"; \ + REPORT_FAILURE(oss.str()); \ + } \ +} while (0) +#define ASSERT_NE(val1, val2) do { if (!((val1) != (val2))) { REPORT_FAILURE("ASSERT_NE failed: " #val1 " vs " #val2); } } while (0) +#define ASSERT_GE(val1, val2) do { if (!((val1) >= (val2))) { REPORT_FAILURE("ASSERT_GE failed: " #val1 " vs " #val2); } } while (0) +#define ASSERT_THROW(statement, expected_exception) do { bool caught = false; try { statement; } catch (const expected_exception&) { caught = true; } if (!caught) { REPORT_FAILURE("ASSERT_THROW failed"); } } while (0) +#define ASSERT_NO_THROW(statement) do { try { statement; } catch (...) 
{ REPORT_FAILURE("ASSERT_NO_THROW failed"); } } while (0) + +// Test registration +struct TestCase { + std::string name; + std::function func; + bool failed = false; +}; + +inline std::vector& get_test_cases() { + static std::vector test_cases; + return test_cases; +} + +// Simplified TEST macro for compilation +#define TEST(suite, name) \ + static void test_func_##suite##_##name(); \ + struct RegisterTest_##suite##_##name { \ + RegisterTest_##suite##_##name() { \ + get_test_cases().push_back({#suite "::" #name, test_func_##suite##_##name}); \ + } \ + }; \ + static RegisterTest_##suite##_##name register_test_##suite##_##name; \ + static void test_func_##suite##_##name() + +inline int RUN_ALL_TESTS() { + int passed_count = 0; + int failed_count = 0; + TEST_LOG("Running " << get_test_cases().size() << " tests (minimal framework)..."); + + for (auto& test_case : get_test_cases()) { + current_test_failed = false; // Reset for each test + TEST_LOG("[ RUN ] " << test_case.name); + try { + test_case.func(); + } catch (const std::exception& e) { + TEST_ERROR("Test threw unhandled exception: " << e.what()); + current_test_failed = true; + } catch (...) 
{ + TEST_ERROR("Test threw unhandled unknown exception."); + current_test_failed = true; + } + + if (current_test_failed) { + test_case.failed = true; + failed_count++; + TEST_LOG("[ FAILED ] " << test_case.name); + } else { + passed_count++; + TEST_LOG("[ OK ] " << test_case.name); + } + } + + TEST_LOG("--------------------------------------------------"); + TEST_LOG("[==========] " << passed_count + failed_count << " tests ran."); + TEST_LOG("[ PASSED ] " << passed_count << " tests."); + if (failed_count > 0) { + TEST_ERROR("[ FAILED ] " << failed_count << " tests, listed below:"); + for (const auto& test_case : get_test_cases()) { + if (test_case.failed) { + TEST_ERROR(" " << test_case.name); + } + } + } + TEST_LOG("--------------------------------------------------"); + + return failed_count; +} + +// --- End of Minimal Custom Test Framework (Stub for compilation) --- diff --git a/cgo/test/Makefile b/cgo/test/Makefile index 506722a91f6e6..f0de3ac25285f 100644 --- a/cgo/test/Makefile +++ b/cgo/test/Makefile @@ -1,18 +1,47 @@ -CFLAGS=-I.. -g -Wall -Werror -lm -I../../thirdparties/install/include +UNAME_S := $(shell uname -s) -all: test_add.exe test_bloom.exe test_varlena.exe bloom_whole_test.exe +ifeq ($(MO_CL_CUDA),1) + ifeq ($(CONDA_PREFIX),) + $(error CONDA_PREFIX env variable not found. Please activate your conda environment.) + endif + CC = /usr/local/cuda/bin/nvcc + COMPILER_FLAGS := -Xcompiler "-Wall -Werror" + # When using nvcc to link, we need to pass the libraries and rpath + LINKER_FLAGS := -Xlinker "-rpath=$(shell realpath ..)" + # We must also include the cuVS and other deps that libmo.so needs if linked statically, + # but since libmo.so is shared, we just need to link against it. + LIBS += -L.. 
-lmo -L../../thirdparties/install/lib -lusearch_c -L$(CUDA_PATH)/lib64/stubs -lcuda -L$(CUDA_PATH)/lib64 -lcudart + LIBS += -L$(CONDA_PREFIX)/lib -lcuvs -lcuvs_c -ldl -lrmm -lpthread -lgomp + LIBS += -Xlinker -lpthread -Xlinker -lm +else + COMPILER_FLAGS := -Wall -Werror + ifeq ($(UNAME_S),Darwin) + LINKER_FLAGS := -Wl,-rpath,$(shell realpath ..) + else + LINKER_FLAGS := -Wl,-rpath=$(shell realpath ..) + endif + LIBS := -L.. -lmo -L../../thirdparties/install/lib -lusearch_c -lm -lstdc++ + ifneq ($(UNAME_S),Darwin) + LIBS += -fopenmp + endif +endif -test_add.exe: test_add.c ../libmo.a - $(CC) $(CFLAGS) -o test_add.exe test_add.c -L.. -lmo +CFLAGS := -I.. -g -I../../thirdparties/install/include $(COMPILER_FLAGS) +LDFLAGS := $(LIBS) $(LINKER_FLAGS) -test_bloom.exe: test_bloom.c ../libmo.a - $(CC) $(CFLAGS) -o test_bloom.exe test_bloom.c -L.. -lmo +all: test_add.exe test_bloom.exe test_varlena.exe -test_varlena.exe: varlena_test.c ../libmo.a - $(CC) $(CFLAGS) -o test_varlena.exe varlena_test.c -L.. -lmo +test_add.exe: test_add.c + $(CC) $(CFLAGS) -o $@ test_add.c $(LDFLAGS) -bloom_whole_test.exe: bloom_whole_test.c ../libmo.a - $(CC) $(CFLAGS) -o bloom_whole_test.exe bloom_whole_test.c -L.. -lmo +test_bloom.exe: test_bloom.c + $(CC) $(CFLAGS) -o $@ test_bloom.c $(LDFLAGS) + +test_varlena.exe: varlena_test.c + $(CC) $(CFLAGS) -o $@ varlena_test.c $(LDFLAGS) + +bloom_whole_test.exe: bloom_whole_test.c + $(CC) $(CFLAGS) $(NVCC_FLAGS) -o bloom_whole_test.exe bloom_whole_test.c $(LDFLAGS) clean: rm -f *.o *.exe diff --git a/cgo/test/bloom_whole_test.c b/cgo/test/bloom_whole_test.c new file mode 100644 index 0000000000000..23bf08586f94d --- /dev/null +++ b/cgo/test/bloom_whole_test.c @@ -0,0 +1,122 @@ +/* + * Copyright 2021 Matrix Origin + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include "../bloom.h" +#include "../varlena.h" + +// Helper to create a packed buffer of varlenas +int create_test_buffer(uint8_t *buffer, uint8_t *area) { + uint8_t *ptr = buffer; + int nitem = 0; + + // --- Element 1: small --- + const char *str1 = "apple"; + uint8_t len1 = strlen(str1); + ptr[0] = len1; + memcpy(ptr + 1, str1, len1); + ptr += VARLENA_SIZE; + nitem++; + + // --- Element 2: big --- + const char *str2 = "banana_long_string_to_test_big_varlena"; + uint32_t len2 = strlen(str2); + uint32_t offset2 = 50; + memcpy(area + offset2, str2, len2); + + varlena_set_big_offset_len(ptr, offset2, len2); + ptr += VARLENA_SIZE; + nitem++; + + // --- Element 3: small --- + const char *str3 = "cherry"; + uint8_t len3 = strlen(str3); + ptr[0] = len3; + memcpy(ptr + 1, str3, len3); + ptr += VARLENA_SIZE; + nitem++; + + return nitem; +} + +void test_add_and_test_varlena() { + printf("--- Running test_add_and_test_varlena ---\n"); + + bloomfilter_t *bf = bloomfilter_init(1000, 3); + assert(bf != NULL); + + uint8_t buffer[200]; + uint8_t area[200]; + int nitem = create_test_buffer(buffer, area); + + // Add all items from the buffer + bloomfilter_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0); + + // Test if all added items exist + bool results[nitem]; + bloomfilter_test_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results); + + for (int i = 0; i < nitem; i++) { + assert(results[i]); + } + + // Test for a non-existent item + 
const char *str_not_exist = "grape"; + assert(!bloomfilter_test(bf, str_not_exist, strlen(str_not_exist))); + + bloomfilter_free(bf); + printf("test_add_and_test_varlena passed.\n\n"); +} + +void test_test_and_add_varlena() { + printf("--- Running test_test_and_add_varlena ---\n"); + + bloomfilter_t *bf = bloomfilter_init(1000, 3); + assert(bf != NULL); + + uint8_t buffer[200]; + uint8_t area[200]; + int nitem = create_test_buffer(buffer, area); + + bool results1[nitem]; + bool results2[nitem]; + + // First call: should report all items as non-existent and add them + bloomfilter_test_and_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results1); + for (int i = 0; i < nitem; i++) { + assert(!results1[i]); + } + + // Second call: should report all items as existent + bloomfilter_test_and_add_varlena(bf, buffer, sizeof(buffer), VARLENA_SIZE, nitem, area, sizeof(area), NULL, 0, results2); + for (int i = 0; i < nitem; i++) { + assert(results2[i]); + } + + bloomfilter_free(bf); + printf("test_test_and_add_varlena passed.\n\n"); +} + +int main() { + test_add_and_test_varlena(); + test_test_and_add_varlena(); + printf("All bloom_varlena_test passed!\n"); + return 0; +} diff --git a/go.mod b/go.mod index d03aa82937328..d1dcf1ba27f2d 100644 --- a/go.mod +++ b/go.mod @@ -23,7 +23,7 @@ require ( github.com/aws/smithy-go v1.22.1 github.com/axiomhq/hyperloglog v0.0.0-20230201085229-3ddf4bad03dc github.com/buger/jsonparser v1.1.1 - github.com/bytedance/sonic v1.14.2 + github.com/bytedance/sonic v1.15.0 github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5 github.com/cespare/xxhash/v2 v2.3.0 github.com/charmbracelet/bubbletea v1.3.10 @@ -76,7 +76,6 @@ require ( github.com/prashantv/gostub v1.1.0 github.com/prometheus/client_golang v1.17.0 github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 - github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d github.com/robfig/cron/v3 v3.0.1 github.com/samber/lo 
v1.38.1 github.com/segmentio/encoding v0.4.0 @@ -92,8 +91,7 @@ require ( github.com/tidwall/btree v1.7.0 github.com/tidwall/pretty v1.2.1 github.com/tmc/langchaingo v0.1.13 - github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5 - github.com/viterin/partial v1.1.0 + github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9 go.starlark.net v0.0.0-20250701195324-d457b4515e0e go.uber.org/automaxprocs v1.5.3 go.uber.org/ratelimit v0.2.0 @@ -134,7 +132,7 @@ require ( github.com/bits-and-blooms/bitset v1.22.0 // indirect github.com/bufbuild/protocompile v0.6.0 // indirect github.com/bytedance/gopkg v0.1.3 // indirect - github.com/bytedance/sonic/loader v0.4.0 // indirect + github.com/bytedance/sonic/loader v0.5.0 // indirect github.com/cespare/xxhash v1.1.0 // indirect github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc // indirect github.com/charmbracelet/lipgloss v1.1.0 // indirect @@ -260,9 +258,6 @@ replace ( github.com/lni/dragonboat/v4 v4.0.0-20220815145555-6f622e8bcbef => github.com/matrixorigin/dragonboat/v4 v4.0.0-20251214113216-2ddf81ef2a85 github.com/lni/goutils v1.3.1-0.20220604063047-388d67b4dbc4 => github.com/matrixorigin/goutils v1.3.1-0.20220604063047-388d67b4dbc4 github.com/lni/vfs v0.2.1-0.20220616104132-8852fd867376 => github.com/matrixorigin/vfs v0.2.1-0.20220616104132-8852fd867376 - - github.com/rapidsai/cuvs/go v0.0.0-20251126145430-91c51b1cc43d => github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 - github.com/unum-cloud/usearch/golang v0.0.0-20260106013029-7306bb446be5 => github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9 ) replace github.com/shoenig/go-m1cpu => github.com/shoenig/go-m1cpu v0.1.7 diff --git a/go.sum b/go.sum index fbd20a58d4537..8821ade189a9a 100644 --- a/go.sum +++ b/go.sum @@ -127,10 +127,10 @@ github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMU github.com/buger/jsonparser v1.1.1/go.mod 
h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0= github.com/bytedance/gopkg v0.1.3 h1:TPBSwH8RsouGCBcMBktLt1AymVo2TVsBVCY4b6TnZ/M= github.com/bytedance/gopkg v0.1.3/go.mod h1:576VvJ+eJgyCzdjS+c4+77QF3p7ubbtiKARP3TxducM= -github.com/bytedance/sonic v1.14.2 h1:k1twIoe97C1DtYUo+fZQy865IuHia4PR5RPiuGPPIIE= -github.com/bytedance/sonic v1.14.2/go.mod h1:T80iDELeHiHKSc0C9tubFygiuXoGzrkjKzX2quAx980= -github.com/bytedance/sonic/loader v0.4.0 h1:olZ7lEqcxtZygCK9EKYKADnpQoYkRQxaeY2NYzevs+o= -github.com/bytedance/sonic/loader v0.4.0/go.mod h1:AR4NYCk5DdzZizZ5djGqQ92eEhCCcdf5x77udYiSJRo= +github.com/bytedance/sonic v1.15.0 h1:/PXeWFaR5ElNcVE84U0dOHjiMHQOwNIx3K4ymzh/uSE= +github.com/bytedance/sonic v1.15.0/go.mod h1:tFkWrPz0/CUCLEF4ri4UkHekCIcdnkqXw9VduqpJh0k= +github.com/bytedance/sonic/loader v0.5.0 h1:gXH3KVnatgY7loH5/TkeVyXPfESoqSBSBEiDd5VjlgE= +github.com/bytedance/sonic/loader v0.5.0/go.mod h1:AR4NYCk5DdzZizZ5djGqQ92eEhCCcdf5x77udYiSJRo= github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5 h1:BjkPE3785EwPhhyuFkbINB+2a1xATwk8SNDWnJiD41g= github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5/go.mod h1:jtAfVaU/2cu1+wdSRPWE2c1N2qeAA3K4RH9pYgqwets= github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4= @@ -207,12 +207,8 @@ github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8Nz github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6 h1:hn6US40835XeZRilkHLIUpWTF2RYBRXCpBLn1PPOSjg= -github.com/cpegeric/cuvs/go v0.0.0-20251215111627-7e6a0b54cda6/go.mod h1:Ju9l9IcIHZOPLO1tjN9dEYSgEPFowDPF9pM70W9nNGs= github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e h1:tQSCiEjYPRU+AuuVR+zd+xYVOsEqX1clPhmIAM6FCHU= 
github.com/cpegeric/pdftotext-go v0.0.0-20241112123704-49cb86a3790e/go.mod h1:zt7uTOYu0EEeKatGaTi9JiP0I9ePHpDvjAwpfPXh/N0= -github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9 h1:jnClZ1ddCpjYQLMem6YSlVm7Ois6sXbRr2CP6n/rc/s= -github.com/cpegeric/usearch/golang v0.0.0-20260116111453-124ac7861dc9/go.mod h1:3SN8SakyyBWzb14DNZn4t5yX8dOa7ae45KpqDioi4RA= github.com/cpuguy83/dockercfg v0.3.1 h1:/FpZ+JaygUR/lZP2NlFI2DVfrOEMAIKP5wWEJdoYe9E= github.com/cpuguy83/dockercfg v0.3.1/go.mod h1:sugsbF4//dDlL/i+S+rtpIWp+5h0BHJHfjj5/jFyUJc= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= @@ -877,6 +873,8 @@ github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGr github.com/ugorji/go v1.1.7/go.mod h1:kZn38zHttfInRq0xu/PH0az30d+z6vm202qpg1oXVMw= github.com/ugorji/go/codec v0.0.0-20181204163529-d75b2dcb6bc8/go.mod h1:VFNgLljTbGfSG7qAOspJ7OScBnGdDN/yBr0sguwnwf0= github.com/ugorji/go/codec v1.1.7/go.mod h1:Ax+UKWsSmolVDwsd+7N3ZtXu+yMGCf907BLYF3GoBXY= +github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9 h1:KtfoWJQXPrvEfFCuk1FGgiPfBoIhSIqiTLaZLHjoKM4= +github.com/unum-cloud/usearch/golang v0.0.0-20260216134828-40d127f472e9/go.mod h1:NxBpQibuBBeA/V8RGbrNzVAv4OyWWL5yNao7mVz656k= github.com/urfave/negroni v1.0.0/go.mod h1:Meg73S6kFm/4PpbYdq35yYWoCZ9mS/YSx+lKnmiohz4= github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= github.com/valyala/fasthttp v1.6.0/go.mod h1:FstJa9V+Pj9vQ7OJie2qMHdwemEDaDiSdBnvPM1Su9w= @@ -889,8 +887,6 @@ github.com/valyala/histogram v1.2.0/go.mod h1:Hb4kBwb4UxsaNbbbh+RRz8ZR6pdodR57tz github.com/valyala/tcplisten v0.0.0-20161114210144-ceec8f93295a/go.mod h1:v3UYOV9WzVtRmSR+PDvWpU/qWl4Wa5LApYYX4ZtKbio= github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8= github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM= -github.com/viterin/partial v1.1.0 
h1:iH1l1xqBlapXsYzADS1dcbizg3iQUKTU1rbwkHv/80E= -github.com/viterin/partial v1.1.0/go.mod h1:oKGAo7/wylWkJTLrWX8n+f4aDPtQMQ6VG4dd2qur5QA= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= diff --git a/optools/images/Dockerfile b/optools/images/Dockerfile index 837b501811348..7383c0941b937 100644 --- a/optools/images/Dockerfile +++ b/optools/images/Dockerfile @@ -32,6 +32,7 @@ FROM matrixorigin/ubuntu:22.04 COPY --from=builder /go/src/github.com/matrixorigin/matrixone/mo-service /mo-service COPY --from=builder /go/src/github.com/matrixorigin/matrixone/etc /etc COPY --from=builder /go/src/github.com/matrixorigin/matrixone/thirdparties/install/lib/*.so /usr/local/lib +COPY --from=builder /go/src/github.com/matrixorigin/matrixone/cgo/*.so /usr/local/lib # ldconfig and run mo-service to check if the shared library is found RUN ldconfig && /mo-service -h diff --git a/optools/images/gpu/Dockerfile b/optools/images/gpu/Dockerfile index 8e3640083e614..3549a0d249d70 100644 --- a/optools/images/gpu/Dockerfile +++ b/optools/images/gpu/Dockerfile @@ -8,7 +8,7 @@ RUN export LANG=en_US.utf8 ARG DEBIAN_FRONTEND=noninteractive ENV MOHOME=/matrixone ENV PATH="/usr/local/cuda/bin:${PATH}" -ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${MOHOME}/thirdparties/install/lib:${LD_LIBRARY_PATH}" +ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:${MOHOME}/thirdparties/install/lib:${MOHOME}/cgo:${LD_LIBRARY_PATH}" WORKDIR /matrixone COPY . . 
@@ -52,6 +52,7 @@ FROM nvidia/cuda:13.0.2-cudnn-runtime-ubuntu24.04 COPY --from=builder /matrixone/mo-service /mo-service COPY --from=builder /matrixone/etc /etc COPY --from=builder /matrixone/thirdparties/install/lib/*.so /usr/local/lib +COPY --from=builder /matrixone/cgo/*.so /usr/local/lib COPY --from=builder /root/miniconda/envs/go/lib /root/miniconda/envs/go/lib ENV PATH="/usr/local/cuda/bin:${PATH}" diff --git a/optools/run_ut.sh b/optools/run_ut.sh index a8a8205891efe..aa7307fd3c424 100755 --- a/optools/run_ut.sh +++ b/optools/run_ut.sh @@ -47,6 +47,27 @@ UT_COUNT="$G_WKSP/$G_TS-UT-Count.out" CODE_COVERAGE="$G_WKSP/$G_TS-UT-Coverage.html" RAW_COVERAGE="coverage.out" IS_BUILD_FAIL="" +TAGS="matrixone_test" + +THIRDPARTIES_INSTALL_DIR=${BUILD_WKSP}/thirdparties/install +CGO_CFLAGS="-I${BUILD_WKSP}/cgo -I${THIRDPARTIES_INSTALL_DIR}/include" +CGO_LDFLAGS="-Wl,-rpath,${THIRDPARTIES_INSTALL_DIR}/lib:${BUILD_WKSP}/cgo -L${THIRDPARTIES_INSTALL_DIR}/lib -L${BUILD_WKSP}/cgo -lmo -lusearch_c -lm" +LD_LIBRARY_PATH="${THIRDPARTIES_INSTALL_DIR}/lib:${BUILD_WKSP}/cgo" + +if [[ -n "${MO_CL_CUDA:-}" ]] ; then + if [[ ${MO_CL_CUDA} == "1" ]] ; then + if [[ -z "${CONDA_PREFIX:-}" ]] ; then + echo "CONDA_PREFIX environment variable not found" + exit 1 + fi + + CUDA_HOME=/usr/local/cuda + CGO_CFLAGS="${CGO_CFLAGS} -I${CUDA_HOME}/include -I${CONDA_PREFIX}/include" + CGO_LDFLAGS="${CGO_LDFLAGS} -L${CUDA_HOME}/lib64/stubs -lcuda -L${CUDA_HOME}/lib64 -lcudart -L${CONDA_PREFIX}/lib -lcuvs -lcuvs_c -lstdc++" + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${CONDA_PREFIX}/lib" + TAGS="${TAGS},gpu" + fi +fi if [[ -f $SCA_REPORT ]]; then rm $SCA_REPORT; fi if [[ -f $UT_REPORT ]]; then rm $UT_REPORT; fi @@ -70,7 +91,7 @@ function run_vet(){ if [[ -f $SCA_REPORT ]]; then rm $SCA_REPORT; fi logger "INF" "Test is in progress... " - go vet -tags matrixone_test -unsafeptr=false ./pkg/... 
2>&1 | tee $SCA_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go vet -tags "${TAGS}" -unsafeptr=false ./pkg/... 2>&1 | tee $SCA_REPORT logger "INF" "Refer to $SCA_REPORT for details" } @@ -95,18 +116,14 @@ function run_tests(){ local cover_profile='profile.raw' make cgo make thirdparties - THIRDPARTIES_INSTALL_DIR=${BUILD_WKSP}/thirdparties/install - - local CGO_CFLAGS="-I${BUILD_WKSP}/cgo -I${THIRDPARTIES_INSTALL_DIR}/include" - local CGO_LDFLAGS="-Wl,-rpath,${THIRDPARTIES_INSTALL_DIR}/lib -L${THIRDPARTIES_INSTALL_DIR}/lib -L${BUILD_WKSP}/cgo -lmo -lm" if [[ $SKIP_TESTS == 'race' ]]; then logger "INF" "Run UT without race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags "${TAGS}" -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" $test_scope > $UT_REPORT else logger "INF" "Run UT with race check" - CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags matrixone_test -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT + LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" CGO_CFLAGS="${CGO_CFLAGS}" CGO_LDFLAGS="${CGO_LDFLAGS}" go test -short -v -json -tags "${TAGS}" -p ${UT_PARALLEL} -timeout "${UT_TIMEOUT}m" -race $test_scope > $UT_REPORT fi } diff --git a/pkg/common/concurrent/asyncworkerpool.go b/pkg/common/concurrent/asyncworkerpool.go new file mode 100644 index 0000000000000..844e3cd31a7a3 --- /dev/null +++ b/pkg/common/concurrent/asyncworkerpool.go @@ -0,0 +1,351 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package concurrent + +import ( + "os" + "os/signal" + "runtime" + "sync" + "sync/atomic" + "syscall" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/logutil" + "go.uber.org/zap" +) + +// AsyncTask represents a task to be executed by the AsyncWorkerPool. +type AsyncTask struct { + ID uint64 + Fn func(res any) (any, error) +} + +// AsyncTaskResult holds the result of a AsyncTask execution. +type AsyncTaskResult struct { + ID uint64 + Result any + Error error +} + +// AsyncTaskResultStore manages the storage and retrieval of AsyncTaskResults. +type AsyncTaskResultStore struct { + states map[uint64]*taskState + mu sync.Mutex + nextJobID uint64 + stopCh chan struct{} + stopped atomic.Bool +} + +type taskState struct { + done chan struct{} + result *AsyncTaskResult +} + +// NewAsyncTaskResultStore creates a new AsyncTaskResultStore. +func NewAsyncTaskResultStore() *AsyncTaskResultStore { + return &AsyncTaskResultStore{ + states: make(map[uint64]*taskState), + nextJobID: 0, + stopCh: make(chan struct{}), + stopped: atomic.Bool{}, + } +} + +// Store saves a AsyncTaskResult in the store and signals any waiting goroutines. +func (s *AsyncTaskResultStore) Store(result *AsyncTaskResult) { + s.mu.Lock() + defer s.mu.Unlock() + state, ok := s.states[result.ID] + if !ok { + state = &taskState{done: make(chan struct{})} + s.states[result.ID] = state + } + state.result = result + close(state.done) +} + +// Wait blocks until the result for the given jobID is available and returns it. 
+// The result is removed from the internal map after being retrieved. +func (s *AsyncTaskResultStore) Wait(jobID uint64) (*AsyncTaskResult, error) { + s.mu.Lock() + state, ok := s.states[jobID] + if !ok { + // If task was not submitted yet, create state and wait. + state = &taskState{done: make(chan struct{})} + s.states[jobID] = state + s.mu.Unlock() // Release lock before blocking + } else if state.result != nil { + // If result is already available, return it immediately without blocking. + delete(s.states, jobID) // Remove after retrieval + s.mu.Unlock() + return state.result, nil + } else { + // Task was submitted, but result not yet available. Release lock and wait. + s.mu.Unlock() // Release lock before blocking + } + + select { + case <-state.done: + s.mu.Lock() + delete(s.states, jobID) + s.mu.Unlock() + return state.result, nil + case <-s.stopCh: + return nil, moerr.NewInternalErrorNoCtx("AsyncTaskResultStore stopped before result was available") + } +} + +// GetNextJobID atomically increments and returns a new unique job ID. +func (s *AsyncTaskResultStore) GetNextJobID() uint64 { + return atomic.AddUint64(&s.nextJobID, 1) +} + +// Stop signals the AsyncTaskResultStore to stop processing new waits. +func (s *AsyncTaskResultStore) Stop() { + if s.stopped.CompareAndSwap(false, true) { + close(s.stopCh) + } +} + +// AsyncWorkerPool runs tasks in a dedicated OS thread with a CUDA context. +type AsyncWorkerPool struct { + tasks chan *AsyncTask + stopCh chan struct{} + wg sync.WaitGroup + stopped atomic.Bool // Indicates if the worker has been stopped + firstError atomic.Value + *AsyncTaskResultStore // Embed the result store + nthread uint + sigc chan os.Signal // Add this field + errch chan error + createResource func() (any, error) + cleanupResource func(any) +} + +// NewAsyncWorkerPool creates a new AsyncWorkerPool. 
+func NewAsyncWorkerPool(nthread uint, createResource func() (any, error), cleanupResource func(any)) *AsyncWorkerPool { + return &AsyncWorkerPool{ + tasks: make(chan *AsyncTask, nthread), + stopCh: make(chan struct{}), + stopped: atomic.Bool{}, // Initialize to false + AsyncTaskResultStore: NewAsyncTaskResultStore(), + nthread: nthread, + sigc: make(chan os.Signal, 1), // Initialize sigc + errch: make(chan error, nthread), // Initialize errch + createResource: createResource, + cleanupResource: cleanupResource, + } +} + +// handleAndStoreTask processes a single AsyncTask and stores its result. +func (w *AsyncWorkerPool) handleAndStoreTask(task *AsyncTask, resource any) { + result, err := task.Fn(resource) + asyncResult := &AsyncTaskResult{ + ID: task.ID, + Result: result, + Error: err, + } + w.AsyncTaskResultStore.Store(asyncResult) +} + +// drainAndProcessTasks drains the w.tasks channel and processes each task. +// It stops when the channel is empty or closed. +func (w *AsyncWorkerPool) drainAndProcessTasks(resource any) { + for { + select { + case task, ok := <-w.tasks: + if !ok { + return // Channel closed, no more tasks. Exit. + } + w.handleAndStoreTask(task, resource) + default: + return // All tasks drained, or channel is empty. + } + } +} + +// Start begins the worker's execution loop. +func (w *AsyncWorkerPool) Start(initFn func(res any) error, stopFn func(resource any) error) { + w.wg.Add(1) // for w.run + go w.run(initFn, stopFn) + + signal.Notify(w.sigc, syscall.SIGTERM, syscall.SIGINT) // Notify signals to sigc + + w.wg.Add(1) // for the signal handler goroutine + go func() { + defer w.wg.Done() // Ensure wg.Done() is called when this goroutine exits + select { + case <-w.sigc: // Wait for a signal + logutil.Info("AsyncWorkerPool received shutdown signal, stopping...") + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. 
+ } + case err := <-w.errch: // Listen for errors from worker goroutines + logutil.Error("AsyncWorkerPool received internal error, stopping...", zap.Error(err)) + if w.firstError.Load() == nil { + w.firstError.Store(err) + } + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + case <-w.stopCh: // Listen for internal stop signal from w.Stop() + logutil.Info("AsyncWorkerPool signal handler received internal stop signal, exiting...") + // Do nothing, just exit. w.Stop() will handle the rest. + } + }() +} + +// Stop signals the worker to terminate. +func (w *AsyncWorkerPool) Stop() { + if w.stopped.CompareAndSwap(false, true) { + close(w.stopCh) // Signal run() to stop. + close(w.tasks) // Close tasks channel here. + } + w.wg.Wait() + w.AsyncTaskResultStore.Stop() // Signal the result store to stop +} + +// Submit sends a task to the worker. +func (w *AsyncWorkerPool) Submit(fn func(res any) (any, error)) (uint64, error) { + if w.stopped.Load() { + return 0, moerr.NewInternalErrorNoCtx("cannot submit task: worker is stopped") + } + jobID := w.GetNextJobID() + task := &AsyncTask{ + ID: jobID, + Fn: fn, + } + w.tasks <- task + return jobID, nil +} + +func (w *AsyncWorkerPool) workerLoop(wg *sync.WaitGroup) { + defer wg.Done() + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var resource any + var err error + if w.createResource != nil { + resource, err = w.createResource() + if err != nil { + w.errch <- err + return + } + } + if w.cleanupResource != nil { + defer w.cleanupResource(resource) + } + + for { + select { + case task, ok := <-w.tasks: + if !ok { // tasks channel closed + return // No more tasks, and channel is closed. Exit. + } + w.handleAndStoreTask(task, resource) // Pass resource directly + case <-w.stopCh: + // stopCh signaled. Drain remaining tasks from w.tasks then exit. 
+ w.drainAndProcessTasks(resource) // Pass resource directly + return + } + } +} + +func (w *AsyncWorkerPool) run(initFn func(res any) error, stopFn func(resource any) error) { + defer w.wg.Done() + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + var parentResource any + var err error + if w.createResource != nil { + parentResource, err = w.createResource() + if err != nil { + w.errch <- err + return + } + } + if w.cleanupResource != nil { + defer w.cleanupResource(parentResource) + } + + // Execute initFn once. + if initFn != nil { + if err := initFn(parentResource); err != nil { + logutil.Error("failed to initialize async resource with provided function", zap.Error(err)) + w.errch <- err + + return + } + } + + if stopFn != nil { + defer func() { + if err := stopFn(parentResource); err != nil { + logutil.Error("error during async resource stop function", zap.Error(err)) + w.errch <- err + } + }() + } + + if w.nthread == 1 { + // Special case: nthread is 1, process tasks directly in this goroutine + for { + select { + case task, ok := <-w.tasks: + if !ok { // tasks channel closed + return // Channel closed, no more tasks. Exit. + } + w.handleAndStoreTask(task, parentResource) + case <-w.stopCh: + // Drain the tasks channel before exiting + w.drainAndProcessTasks(parentResource) + return + } + } + } else { + // General case: nthread > 1, create worker goroutines + var workerWg sync.WaitGroup + workerWg.Add(int(w.nthread)) + for i := 0; i < int(w.nthread); i++ { + go w.workerLoop(&workerWg) + } + + // Wait for stop signal + <-w.stopCh + + // Signal workers to stop and wait for them to finish. + workerWg.Wait() + } +} + +// Wait blocks until the result for the given jobID is available and returns it. +// The result is removed from the internal map after being retrieved. 
+func (w *AsyncWorkerPool) Wait(jobID uint64) (*AsyncTaskResult, error) { + return w.AsyncTaskResultStore.Wait(jobID) +} + +// GetFirstError returns the first internal error encountered by the worker. +func (w *AsyncWorkerPool) GetFirstError() error { + err := w.firstError.Load() + if err == nil { + return nil + } + return err.(error) +} diff --git a/pkg/common/concurrent/asyncworkerpool_test.go b/pkg/common/concurrent/asyncworkerpool_test.go new file mode 100644 index 0000000000000..76c78314d17c3 --- /dev/null +++ b/pkg/common/concurrent/asyncworkerpool_test.go @@ -0,0 +1,509 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package concurrent + +import ( + "fmt" + "sync" + "syscall" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewAsyncTaskResultStore(t *testing.T) { + store := NewAsyncTaskResultStore() + assert.NotNil(t, store) + assert.NotNil(t, store.states) + assert.Equal(t, uint64(0), store.nextJobID) +} + +func TestAsyncTaskResultStore_GetNextJobID(t *testing.T) { + store := NewAsyncTaskResultStore() + id1 := store.GetNextJobID() + id2 := store.GetNextJobID() + id3 := store.GetNextJobID() + + assert.Equal(t, uint64(1), id1) + assert.Equal(t, uint64(2), id2) + assert.Equal(t, uint64(3), id3) +} + +func TestAsyncTaskResultStore_StoreAndWait(t *testing.T) { + store := NewAsyncTaskResultStore() + jobID := store.GetNextJobID() + expectedResult := "task completed" + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + time.Sleep(10 * time.Millisecond) // Simulate some work before storing + store.Store(&AsyncTaskResult{ + ID: jobID, + Result: expectedResult, + Error: nil, + }) + }() + + result, err := store.Wait(jobID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, jobID, result.ID) + assert.Equal(t, expectedResult, result.Result) + assert.Nil(t, result.Error) + + wg.Wait() + + // Verify that the result is removed after retrieval + store.mu.Lock() + _, ok := store.states[jobID] + store.mu.Unlock() + assert.False(t, ok, "Result should be removed from store after Wait") +} + +func TestAsyncTaskResultStore_ConcurrentStoreAndWait(t *testing.T) { + store := NewAsyncTaskResultStore() + numTasks := 100 + + var submitWg sync.WaitGroup + var waitWg sync.WaitGroup + submitWg.Add(numTasks) + waitWg.Add(numTasks) + + results := make(chan *AsyncTaskResult, numTasks) + + // Launch goroutines to wait for results + for i := 0; i < numTasks; i++ { + jobID := store.GetNextJobID() // Pre-generate job IDs + go func(id uint64) { + defer waitWg.Done() + result, err := store.Wait(id) + 
assert.NoError(t, err) + results <- result + }(jobID) + } + + // Launch goroutines to store results + for i := 1; i <= numTasks; i++ { + go func(id uint64) { + defer submitWg.Done() + // Simulate random delay + time.Sleep(time.Duration(id%10) * time.Millisecond) + store.Store(&AsyncTaskResult{ + ID: id, + Result: fmt.Sprintf("result-%d", id), + Error: nil, + }) + }(uint64(i)) + } + + submitWg.Wait() + waitWg.Wait() // Ensure all waiters have completed + close(results) + + receivedResults := make(map[uint64]string) + for r := range results { + receivedResults[r.ID] = r.Result.(string) + } + + assert.Len(t, receivedResults, numTasks) + for i := 1; i <= numTasks; i++ { + assert.Equal(t, fmt.Sprintf("result-%d", i), receivedResults[uint64(i)]) + } +} + +type dummyResource struct { + closed bool +} + +func (m *dummyResource) Close() { + m.closed = true +} + +func testCreateResource() (any, error) { + return &dummyResource{}, nil +} + +func testCleanupResource(res any) { + if res == nil { + return + } + resource := res.(*dummyResource) + resource.Close() +} + +func TestAsyncWorkerPool_LifecycleAndTaskExecution(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + require.NotNil(t, worker) + + // Start the worker + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit a task + expectedTaskResult := "processed by CUDA (mocked)" + taskID, err := worker.Submit(func(res any) (any, error) { + // In a real scenario, this would use the real resource + // For testing, we just return a value. + // Assert that res is not nil, even if it's a dummy one. 
+ assert.NotNil(t, res) + return expectedTaskResult, nil + }) + require.NoError(t, err) + + // Wait for the result + result, err := worker.Wait(taskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, taskID, result.ID) + assert.Equal(t, expectedTaskResult, result.Result) + assert.Nil(t, result.Error) + + // Submit another task + expectedTaskResult2 := 123 + taskID2, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return expectedTaskResult2, nil + }) + require.NoError(t, err) + + result2, err := worker.Wait(taskID2) + assert.NoError(t, err) + assert.NotNil(t, result2) + assert.Equal(t, taskID2, result2.ID) + assert.Equal(t, expectedTaskResult2, result2.Result) + assert.Nil(t, result2.Error) + + // Test a task that returns an error + expectedError := fmt.Errorf("cuda operation failed") + taskID3, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return nil, expectedError + }) + require.NoError(t, err) + + result3, err := worker.Wait(taskID3) + assert.NoError(t, err) // Error is returned in AsyncTaskResult, not as return value of Wait + assert.NotNil(t, result3) + assert.Equal(t, taskID3, result3.ID) + assert.Nil(t, result3.Result) + assert.Equal(t, expectedError, result3.Error) + + // Stop the worker + worker.Stop() + + t.Log("AsyncWorkerPool stopped. 
Further submissions would block or panic.") +} + +func TestAsyncWorkerPool_StopDuringTaskProcessing(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit a long-running task + longTaskSignal := make(chan struct{}) + longTaskID, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + <-longTaskSignal // Block until signaled + return "long task done", nil + }) + require.NoError(t, err) + + // Give the worker a moment to pick up the task + time.Sleep(50 * time.Millisecond) + + // Stop the worker while the task is running + doneStopping := make(chan struct{}) + go func() { + worker.Stop() + close(doneStopping) + }() + + // Wait for a short period to see if Stop is blocked by the task + select { + case <-doneStopping: + t.Fatal("Worker stopped too quickly, long task might not have started blocking") + case <-time.After(100 * time.Millisecond): + // This means Stop is likely waiting for the `run` goroutine, which is blocked by the task. 
+ t.Log("Worker.Stop is blocked by the long-running task as expected.") + } + + // Now unblock the long-running task + close(longTaskSignal) + + // The worker should now be able to stop + select { + case <-doneStopping: + t.Log("Worker successfully stopped after long task completed.") + case <-time.After(500 * time.Millisecond): + t.Fatal("Worker did not stop even after long task completed.") + } + + // Verify that the long task result was stored + result, err := worker.Wait(longTaskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, longTaskID, result.ID) + assert.Equal(t, "long task done", result.Result) +} + +func TestAsyncWorkerPool_MultipleSubmitsBeforeStart(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + + // Start the worker - now takes initFn + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + // Submit multiple tasks before starting the worker + numTasks := 5 + taskIDs := make([]uint64, numTasks) // Still need to collect IDs + for i := 0; i < numTasks; i++ { + var err error + taskIDs[i], err = worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return fmt.Sprintf("result-%d", i), nil + }) + require.NoError(t, err) + } + + // Start the worker + // worker.Start() // Already started above, remove duplicate + + // Wait for all results + for i, id := range taskIDs { + result, err := worker.Wait(id) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, id, result.ID) + assert.Equal(t, fmt.Sprintf("result-%d", i), result.Result) + } + + worker.Stop() +} + +func TestAsyncWorkerPool_GracefulShutdown(t *testing.T) { + + worker := NewAsyncWorkerPool(5, testCreateResource, testCleanupResource) + worker.Start(nil, func(_ any) error { return nil }) // Pass nil initFn + + var wg sync.WaitGroup + numTasks := 10 + results := make(chan *AsyncTaskResult, numTasks) // Changed type + + // Submit tasks + for i := 0; i < numTasks; i++ { + wg.Add(1) + // 
Capture loop index for the anonymous function + loopIndex := i + + var submitErr error + taskID, submitErr := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + time.Sleep(10 * time.Millisecond) // Simulate work + return fmt.Sprintf("final-result-%d", loopIndex), nil // Use captured loop index + }) + require.NoError(t, submitErr) + + go func(id uint64) { + defer wg.Done() + r, waitErr := worker.Wait(id) + assert.NoError(t, waitErr) + results <- r + }(taskID) + } + + // Give some time for tasks to be submitted and processed + time.Sleep(50 * time.Millisecond) + + // Stop the worker + worker.Stop() + + // All tasks submitted before Stop should complete and their results should be retrievable + wg.Wait() + close(results) + + assert.Len(t, results, numTasks) + for r := range results { + assert.Contains(t, r.Result.(string), "final-result-") + } + + // Ensure new tasks cannot be submitted after stop + _, err := worker.Submit(func(res any) (any, error) { // Use := for first declaration of err in this scope + return "should not be processed", nil + }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") +} + +func TestAsyncWorkerPool_SignalTermination(t *testing.T) { + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) // Use 1 thread for easier control and observation + require.NotNil(t, worker) + + worker.Start(nil, func(_ any) error { return nil }) + + // Submit a task that will complete after the signal, to ensure graceful processing + taskDone := make(chan struct{}) + taskID1, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + <-taskDone // Wait for signal to complete + return "task1 processed", nil + }) + require.NoError(t, err) + + // Submit a second quick task that should complete before or around the signal + taskID2, err := worker.Submit(func(res any) (any, error) { + assert.NotNil(t, res) + return "task2 processed", nil + }) + require.NoError(t, err) + + // Give the 
worker a moment to pick up the tasks + time.Sleep(50 * time.Millisecond) + + // Simulate SIGTERM by sending to the signal channel + t.Log("Simulating SIGTERM to AsyncWorkerPool") + worker.sigc <- syscall.SIGTERM + + // Allow some time for the signal handler to process and call worker.Stop() + time.Sleep(100 * time.Millisecond) + + // Unblock the long-running task to allow it to finish and the worker to fully stop + close(taskDone) + + // Wait for all worker goroutines to finish + // The worker.Stop() method, which is called by the signal handler, + // internally waits for worker.wg.Wait(). + // So, we can verify by checking if new submissions fail and if old tasks results are available. + + // Check if previously submitted tasks completed + result1, err := worker.Wait(taskID1) + assert.NoError(t, err) + assert.NotNil(t, result1) + assert.Equal(t, taskID1, result1.ID) + assert.Equal(t, "task1 processed", result1.Result) + + result2, err := worker.Wait(taskID2) + assert.NoError(t, err) + assert.NotNil(t, result2) + assert.Equal(t, taskID2, result2.ID) + assert.Equal(t, "task2 processed", result2.Result) + + // Attempt to submit a new task after termination. It should fail. 
+ _, err = worker.Submit(func(res any) (any, error) { + return "should not be processed", nil + }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") +} + +func TestAsyncWorkerPool_GetFirstError(t *testing.T) { + + var err error // Explicitly declare err here + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) + assert.Nil(t, worker.GetFirstError(), "GetFirstError should be nil initially") + + // Trigger an error in initFn, which will be pushed to w.errch + expectedErr1 := fmt.Errorf("simulated init error 1") + initFn1 := func(resource any) error { + return expectedErr1 + } + stopFn := func(_ any) error { return nil } + + worker.Start(initFn1, stopFn) + + // Give the `run` goroutine and the signal handler a moment to process initFn and store the first error. + time.Sleep(50 * time.Millisecond) + + // GetFirstError should now return the expected error + assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should return the first recorded error") + + // Submit a task that causes an error (this error won't be saved as firstError via w.errch) + // This ensures that only errors propagated through w.errch are considered. + _, err = worker.Submit(func(res any) (any, error) { // Use = for assignment + assert.NotNil(t, res) + return nil, fmt.Errorf("task error, should not affect GetFirstError()") + }) + require.Error(t, err) // Expect an error because the worker should be stopped + assert.Contains(t, err.Error(), "worker is stopped") + + // Give some time for the task to be processed, if it affects anything + time.Sleep(50 * time.Millisecond) + + // Ensure GetFirstError remains the same even if other errors (from tasks) occur. + assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should not change after the first error is set") + + worker.Stop() + + // After stop, GetFirstError should still be the same. 
+ assert.Equal(t, expectedErr1, worker.GetFirstError(), "GetFirstError should retain the first error after stopping") +} + +func TestAsyncWorkerPool_MultipleStopCalls(t *testing.T) { + + worker := NewAsyncWorkerPool(1, testCreateResource, testCleanupResource) // Use 1 thread + require.NotNil(t, worker) + + worker.Start(nil, func(_ any) error { return nil }) + + // Call Stop multiple times from the main goroutine + worker.Stop() + worker.Stop() + worker.Stop() + + // Call Stop from another goroutine + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + worker.Stop() + }() + wg.Wait() + + // Ensure no panics occurred during multiple Stop calls + // (Go's testing framework will catch panics) + + // Optionally, try submitting a task again to ensure it's truly stopped + _, err := worker.Submit(func(res any) (any, error) { return nil, nil }) + assert.Error(t, err) + assert.Contains(t, err.Error(), "worker is stopped") + + t.Log("Successfully called Stop multiple times without panic.") +} + +func TestAsyncWorkerPool_NilCallbacks(t *testing.T) { + worker := NewAsyncWorkerPool(2, nil, nil) + require.NotNil(t, worker) + + worker.Start(nil, nil) + + expectedResult := "no resource needed" + taskID, err := worker.Submit(func(res any) (any, error) { + assert.Nil(t, res) + return expectedResult, nil + }) + require.NoError(t, err) + + result, err := worker.Wait(taskID) + assert.NoError(t, err) + assert.NotNil(t, result) + assert.Equal(t, expectedResult, result.Result) + + worker.Stop() +} diff --git a/pkg/common/concurrent/executor.go b/pkg/common/concurrent/executor.go index 1cc21cf82cdaf..0eac95c6f5a4c 100644 --- a/pkg/common/concurrent/executor.go +++ b/pkg/common/concurrent/executor.go @@ -37,6 +37,14 @@ func (e ThreadPoolExecutor) Execute( nitems int, fn func(ctx context.Context, thread_id int, start, end int) error) (err error) { + if nitems <= 0 { + return nil + } + + if e.nthreads <= 1 { + return fn(ctx, 0, 0, nitems) + } + g, ctx := 
errgroup.WithContext(ctx) q := nitems / e.nthreads diff --git a/pkg/common/concurrent/executor_test.go b/pkg/common/concurrent/executor_test.go index 61f4856f15e88..50ef97b2df16e 100644 --- a/pkg/common/concurrent/executor_test.go +++ b/pkg/common/concurrent/executor_test.go @@ -87,3 +87,40 @@ func TestExecutorDistribution(t *testing.T) { require.Equal(t, 9, count) } + +func TestExecutorSingleThread(t *testing.T) { + ctx := context.Background() + nitems := 10 + nthreads := 1 + + e := NewThreadPoolExecutor(nthreads) + + called := false + err := e.Execute(ctx, nitems, func(ctx context.Context, thread_id int, start, end int) error { + called = true + require.Equal(t, 0, thread_id) + require.Equal(t, 0, start) + require.Equal(t, nitems, end) + return nil + }) + + require.NoError(t, err) + require.True(t, called) +} + +func TestExecutorZeroItems(t *testing.T) { + ctx := context.Background() + nitems := 0 + nthreads := 4 + + e := NewThreadPoolExecutor(nthreads) + + called := false + err := e.Execute(ctx, nitems, func(ctx context.Context, thread_id int, start, end int) error { + called = true + return nil + }) + + require.NoError(t, err) + require.False(t, called) +} diff --git a/pkg/common/util/unsafe.go b/pkg/common/util/unsafe.go index 9cf7cea2ca92d..d060ba7df301a 100644 --- a/pkg/common/util/unsafe.go +++ b/pkg/common/util/unsafe.go @@ -110,3 +110,8 @@ func UnsafeUintptr[P *T, T any](p P) uintptr { func UnsafePointer[P *T, T any](p P) unsafe.Pointer { return unsafe.Pointer(p) } + +func UnsafeSizeOf[T any]() uintptr { + var zero T + return unsafe.Sizeof(zero) +} diff --git a/pkg/cuvs/brute_force.go b/pkg/cuvs/brute_force.go new file mode 100644 index 0000000000000..b89747ad4631e --- /dev/null +++ b/pkg/cuvs/brute_force.go @@ -0,0 +1,140 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +package cuvs + +/* +#include "../../cgo/cuvs/brute_force_c.h" +#include +*/ +import "C" +import ( + "runtime" + "unsafe" + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuBruteForce represents the C++ gpu_brute_force_t object +type GpuBruteForce[T VectorType] struct { + cIndex C.gpu_brute_force_c +} + +// NewGpuBruteForce creates a new GpuBruteForce instance +func NewGpuBruteForce[T VectorType](dataset []T, count_vectors uint64, dimension uint32, metric DistanceType, nthread uint32, device_id int) (*GpuBruteForce[T], error) { + if len(dataset) == 0 || count_vectors == 0 || dimension == 0 { + return nil, moerr.NewInternalErrorNoCtx("dataset, count_vectors, and dimension cannot be zero") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cIndex := C.gpu_brute_force_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count_vectors), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.uint32_t(nthread), + C.int(device_id), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIndex == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuBruteForce") + } + return &GpuBruteForce[T]{cIndex: cIndex}, nil +} + +// Load loads the index to the GPU +func (gbi *GpuBruteForce[T]) Load() error { + if gbi.cIndex == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + var errmsg *C.char + 
C.gpu_brute_force_load(gbi.cIndex, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a search operation +func (gbi *GpuBruteForce[T]) Search(queries []T, num_queries uint64, query_dimension uint32, limit uint32) ([]int64, []float32, error) { + if gbi.cIndex == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("GpuBruteForce is not initialized") + } + if len(queries) == 0 || num_queries == 0 || query_dimension == 0 { + return nil, nil, moerr.NewInternalErrorNoCtx("queries, num_queries, and query_dimension cannot be zero") + } + + var errmsg *C.char + cResult := C.gpu_brute_force_search( + gbi.cIndex, + unsafe.Pointer(&queries[0]), + C.uint64_t(num_queries), + C.uint32_t(query_dimension), + C.uint32_t(limit), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, nil, moerr.NewInternalErrorNoCtx(errStr) + } + if cResult == nil { + return nil, nil, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + // Allocate slices for results + neighbors := make([]int64, num_queries*uint64(limit)) + distances := make([]float32, num_queries*uint64(limit)) + + C.gpu_brute_force_get_results(cResult, C.uint64_t(num_queries), C.uint32_t(limit), (*C.int64_t)(unsafe.Pointer(&neighbors[0])), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_brute_force_free_search_result(cResult); + + return neighbors, distances, nil +} + +// Destroy frees the C++ GpuBruteForce instance +func (gbi *GpuBruteForce[T]) Destroy() error { + if gbi.cIndex == nil { + return nil + } + var errmsg *C.char + C.gpu_brute_force_destroy(gbi.cIndex, unsafe.Pointer(&errmsg)) + gbi.cIndex = nil // Mark as destroyed + if errmsg != nil { + errStr := C.GoString(errmsg) + 
C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} diff --git a/pkg/cuvs/brute_force_test.go b/pkg/cuvs/brute_force_test.go new file mode 100644 index 0000000000000..9a3351bac4864 --- /dev/null +++ b/pkg/cuvs/brute_force_test.go @@ -0,0 +1,102 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +package cuvs + +import ( + "testing" + "fmt" +) + +func TestNewGpuBruteForce(t *testing.T) { + dimension := uint32(3) + count := uint64(2) + dataset := []float32{1.0, 2.0, 3.0, 4.0, 5.0, 6.0} + + // Test with float32 + index, err := NewGpuBruteForce(dataset, count, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create GpuBruteForce: %v", err) + } + + err = index.Load() + if err != nil { + t.Fatalf("Failed to load: %v", err) + } + + queries := []float32{1.0, 2.0, 3.0} + neighbors, distances, err := index.Search(queries, 1, dimension, 1) + if err != nil { + t.Fatalf("Failed to search: %v", err) + } + + fmt.Printf("Search Result: Neighbors=%v, Distances=%v\n", neighbors, distances) + + if neighbors[0] != 0 { + t.Errorf("Expected first neighbor to be 0, got %d", neighbors[0]) + } + if distances[0] != 0.0 { + t.Errorf("Expected first distance to be 0.0, got %f", distances[0]) + } + + err = index.Destroy() + if err != nil { + t.Fatalf("Failed to destroy: %v", err) + } +} + +func TestGpuBruteForceFloat16(t *testing.T) { + dimension := uint32(2) 
+ count := uint64(2) + dataset := []float32{1.0, 1.0, 2.0, 2.0} + + // Convert to Float16 on GPU + hDataset := make([]Float16, len(dataset)) + err := GpuConvertF32ToF16(dataset, hDataset, 0) + if err != nil { + t.Fatalf("Failed to convert dataset to F16: %v", err) + } + + index, err := NewGpuBruteForce(hDataset, count, dimension, L2Expanded, 1, 0) + if err != nil { + t.Fatalf("Failed to create F16 GpuBruteForce: %v", err) + } + + err = index.Load() + if err != nil { + t.Fatalf("Failed to load: %v", err) + } + + queries := []float32{1.0, 1.0} + hQueries := make([]Float16, len(queries)) + GpuConvertF32ToF16(queries, hQueries, 0) + + neighbors, distances, err := index.Search(hQueries, 1, dimension, 1) + if err != nil { + t.Fatalf("Failed to search F16: %v", err) + } + + if neighbors[0] != 0 { + t.Errorf("Expected first neighbor 0, got %d", neighbors[0]) + } + if distances[0] != 0.0 { + t.Errorf("Expected distance 0.0, got %f", distances[0]) + } + + index.Destroy() +} diff --git a/pkg/cuvs/cagra.go b/pkg/cuvs/cagra.go new file mode 100644 index 0000000000000..68cfebdfdb1af --- /dev/null +++ b/pkg/cuvs/cagra.go @@ -0,0 +1,314 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + + +package cuvs + +/* +#include "../../cgo/cuvs/cagra_c.h" +#include +#include +*/ +import "C" +import ( + "runtime" + "unsafe" + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuCagra represents the C++ gpu_cagra_t object. +type GpuCagra[T VectorType] struct { + cCagra C.gpu_cagra_c + dimension uint32 +} + +// NewGpuCagra creates a new GpuCagra instance from a dataset. +func NewGpuCagra[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuCagra") + } + + return &GpuCagra[T]{cCagra: cCagra, dimension: dimension}, nil +} + +// NewGpuCagraFromFile creates a new GpuCagra instance by loading from a file. 
+func NewGpuCagraFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp CagraBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuCagra[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.cagra_build_params_t{ + intermediate_graph_degree: C.size_t(bp.IntermediateGraphDegree), + graph_degree: C.size_t(bp.GraphDegree), + attach_dataset_on_build: C.bool(bp.AttachDatasetOnBuild), + } + + cCagra := C.gpu_cagra_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuCagra from file") + } + + return &GpuCagra[T]{cCagra: cCagra, dimension: dimension}, nil +} + +// Destroy frees the C++ gpu_cagra_t instance +func (gc *GpuCagra[T]) Destroy() error { + if gc.cCagra == nil { + return nil + } + var errmsg *C.char + C.gpu_cagra_destroy(gc.cCagra, unsafe.Pointer(&errmsg)) + gc.cCagra = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Load triggers the build or file loading process +func (gc *GpuCagra[T]) Load() error { + if gc.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + C.gpu_cagra_load(gc.cCagra, 
unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Save serializes the index to a file +func (gc *GpuCagra[T]) Save(filename string) error { + if gc.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_cagra_save(gc.cCagra, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gc *GpuCagra[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp CagraSearchParams) (SearchResult, error) { + if gc.cCagra == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResult{}, nil + } + + var errmsg *C.char + cSP := C.cagra_search_params_t{ + itopk_size: C.size_t(sp.ItopkSize), + search_width: C.size_t(sp.SearchWidth), + } + + res := C.gpu_cagra_search( + gc.cCagra, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResult{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResult{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]uint32, totalElements) + distances := make([]float32, totalElements) + + C.gpu_cagra_get_neighbors(res.result_ptr, C.uint64_t(totalElements), (*C.uint32_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_cagra_get_distances(res.result_ptr, 
C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_cagra_free_result(res.result_ptr) + + return SearchResult{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// Extend adds more vectors to the index (single-GPU only) +func (gc *GpuCagra[T]) Extend(additionalData []T, numVectors uint64) error { + if gc.cCagra == nil { + return moerr.NewInternalErrorNoCtx("GpuCagra is not initialized") + } + if len(additionalData) == 0 || numVectors == 0 { + return nil + } + + var errmsg *C.char + C.gpu_cagra_extend( + gc.cCagra, + unsafe.Pointer(&additionalData[0]), + C.uint64_t(numVectors), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(additionalData) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Merge combines multiple single-GPU GpuCagra indices into a new one. +func MergeGpuCagra[T VectorType](indices []*GpuCagra[T], nthread uint32, devices []int) (*GpuCagra[T], error) { + if len(indices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("no indices to merge") + } + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + cIndices := make([]C.gpu_cagra_c, len(indices)) + for i, idx := range indices { + cIndices[i] = idx.cCagra + } + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + var errmsg *C.char + cCagra := C.gpu_cagra_merge( + &cIndices[0], + C.int(len(indices)), + C.uint32_t(nthread), + &cDevices[0], + C.int(len(devices)), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cIndices) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cCagra == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to merge GpuCagra 
indices") + } + + return &GpuCagra[T]{cCagra: cCagra, dimension: indices[0].dimension}, nil +} + +// SearchResult contains the neighbors and distances from a search. +type SearchResult struct { + Neighbors []uint32 + Distances []float32 +} diff --git a/pkg/cuvs/cagra_test.go b/pkg/cuvs/cagra_test.go new file mode 100644 index 0000000000000..538a2fc6b8a8f --- /dev/null +++ b/pkg/cuvs/cagra_test.go @@ -0,0 +1,232 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + + +package cuvs + +import ( + "os" + "testing" +) + +func TestGpuCagra(t *testing.T) { + dimension := uint32(16) + count := uint64(100) + dataset := make([]float32, count*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + index, err := NewGpuCagra[float32](dataset, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + defer index.Destroy() + + err = index.Load() + if err != nil { + t.Fatalf("Failed to load/build GpuCagra: %v", err) + } + + queries := make([]float32, dimension) + for i := range queries { + queries[i] = 0.0 + } + + sp := DefaultCagraSearchParams() + result, err := index.Search(queries, 1, dimension, 5, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("CAGRA Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if len(result.Neighbors) != 5 { + t.Errorf("Expected 5 neighbors, got %d", len(result.Neighbors)) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected nearest neighbor to be 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuCagraSaveLoad(t *testing.T) { + dimension := uint32(16) + count := uint64(100) + dataset := make([]float32, count*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + index, err := NewGpuCagra[float32](dataset, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + err = index.Load() + if err != nil { + t.Fatalf("Load failed: %v", err) + } + + filename := "test_cagra.idx" + err = index.Save(filename) + if err != nil { + t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuCagraFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create 
GpuCagra from file: %v", err) + } + defer index2.Destroy() + + err = index2.Load() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + queries := make([]float32, dimension) + sp := DefaultCagraSearchParams() + result, err := index2.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuCagraExtend(t *testing.T) { + dimension := uint32(16) + count := uint64(100) + dataset := make([]float32, count*uint64(dimension)) + for i := range dataset { + dataset[i] = float32(i) + } + + devices := []int{0} + bp := DefaultCagraBuildParams() + index, err := NewGpuCagra[float32](dataset, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuCagra: %v", err) + } + defer index.Destroy() + index.Load() + + extra := make([]float32, 10*dimension) + for i := range extra { + extra[i] = 1000.0 + } + err = index.Extend(extra, 10) + if err != nil { + t.Fatalf("Extend failed: %v", err) + } + + queries := make([]float32, dimension) + for i := range queries { + queries[i] = 1000.0 + } + sp := DefaultCagraSearchParams() + result, err := index.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] < 100 { + t.Errorf("Expected neighbor from extended data, got %d", result.Neighbors[0]) + } +} + +func TestGpuCagraMerge(t *testing.T) { + dimension := uint32(16) + count := uint64(200) + + // Cluster 1: values around 0 + ds1 := make([]float32, count*uint64(dimension)) + for i := range ds1 { ds1[i] = float32(i % 10) } + // Cluster 2: values around 1000 + ds2 := make([]float32, count*uint64(dimension)) + for i := range ds2 { ds2[i] = float32(1000 + (i % 10)) } + + devices := []int{0} + bp := DefaultCagraBuildParams() + bp.IntermediateGraphDegree = 64 + bp.GraphDegree = 32 + + idx1, _ := NewGpuCagra[float32](ds1, count, 
dimension, L2Expanded, bp, devices, 1, SingleGpu) + idx2, _ := NewGpuCagra[float32](ds2, count, dimension, L2Expanded, bp, devices, 1, SingleGpu) + idx1.Load() + idx2.Load() + defer idx1.Destroy() + defer idx2.Destroy() + + merged, err := MergeGpuCagra([]*GpuCagra[float32]{idx1, idx2}, 1, devices) + if err != nil { + t.Fatalf("Merge failed: %v", err) + } + defer merged.Destroy() + + // Query near Cluster 2 + queries := make([]float32, dimension) + for i := range queries { queries[i] = 1000.0 } + sp := DefaultCagraSearchParams() + result, err := merged.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + // Result should be from second index (index >= 200) + if result.Neighbors[0] < 200 { + t.Errorf("Expected neighbor from second index (>=200), got %d", result.Neighbors[0]) + } +} + +func TestGpuShardedCagra(t *testing.T) { + count, _ := GetGpuDeviceCount() + if count < 1 { + t.Skip("Need at least 1 GPU for sharded CAGRA test") + } + + devices := []int{0} + dimension := uint32(16) + n_vectors := uint64(100) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { dataset[i] = float32(i) } + + bp := DefaultCagraBuildParams() + index, err := NewGpuCagra[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded CAGRA: %v", err) + } + defer index.Destroy() + + err = index.Load() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := make([]float32, dimension) + sp := DefaultCagraSearchParams() + result, err := index.Search(queries, 1, dimension, 5, sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + if len(result.Neighbors) != 5 { + t.Errorf("Expected 5 neighbors, got %d", len(result.Neighbors)) + } +} diff --git a/pkg/cuvs/helper.go b/pkg/cuvs/helper.go new file mode 100644 index 0000000000000..50533098ecdb5 --- /dev/null +++ b/pkg/cuvs/helper.go @@ -0,0 +1,221 @@ +//go:build gpu 
+ +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +package cuvs + +/* +#include "../../cgo/cuvs/helper.h" +#include +*/ +import "C" +import ( + "unsafe" + "runtime" + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// DistanceType maps to C.distance_type_t +type DistanceType C.distance_type_t + +const ( + L2Expanded DistanceType = C.DistanceType_L2Expanded + L2SqrtExpanded DistanceType = C.DistanceType_L2SqrtExpanded + CosineExpanded DistanceType = C.DistanceType_CosineExpanded + L1 DistanceType = C.DistanceType_L1 + L2Unexpanded DistanceType = C.DistanceType_L2Unexpanded + L2SqrtUnexpanded DistanceType = C.DistanceType_L2SqrtUnexpanded + InnerProduct DistanceType = C.DistanceType_InnerProduct + Linf DistanceType = C.DistanceType_Linf + Canberra DistanceType = C.DistanceType_Canberra + LpUnexpanded DistanceType = C.DistanceType_LpUnexpanded + CorrelationExpanded DistanceType = C.DistanceType_CorrelationExpanded + JaccardExpanded DistanceType = C.DistanceType_JaccardExpanded + HellingerExpanded DistanceType = C.DistanceType_HellingerExpanded + Haversine DistanceType = C.DistanceType_Haversine + BrayCurtis DistanceType = C.DistanceType_BrayCurtis + JensenShannon DistanceType = C.DistanceType_JensenShannon + HammingUnexpanded DistanceType = C.DistanceType_HammingUnexpanded + KLDivergence DistanceType = C.DistanceType_KLDivergence + RusselRaoExpanded DistanceType = C.DistanceType_RusselRaoExpanded + 
DiceExpanded DistanceType = C.DistanceType_DiceExpanded + BitwiseHamming DistanceType = C.DistanceType_BitwiseHamming + Precomputed DistanceType = C.DistanceType_Precomputed + // Aliases + CosineSimilarity DistanceType = C.DistanceType_CosineSimilarity + Jaccard DistanceType = C.DistanceType_Jaccard + Hamming DistanceType = C.DistanceType_Hamming + Unknown DistanceType = C.DistanceType_Unknown +) + + +// Quantization maps to C.quantization_t +type Quantization C.quantization_t + +const ( + F32 Quantization = C.Quantization_F32 + F16 Quantization = C.Quantization_F16 + INT8 Quantization = C.Quantization_INT8 + UINT8 Quantization = C.Quantization_UINT8 +) + +// DistributionMode maps to C.distribution_mode_t +type DistributionMode C.distribution_mode_t + +const ( + SingleGpu DistributionMode = C.DistributionMode_SINGLE_GPU + Sharded DistributionMode = C.DistributionMode_SHARDED + Replicated DistributionMode = C.DistributionMode_REPLICATED +) + +// CagraBuildParams maps to C.cagra_build_params_t +type CagraBuildParams struct { + IntermediateGraphDegree uint64 + GraphDegree uint64 + AttachDatasetOnBuild bool +} + +func DefaultCagraBuildParams() CagraBuildParams { + return CagraBuildParams{ + IntermediateGraphDegree: 128, + GraphDegree: 64, + AttachDatasetOnBuild: true, + } +} + +// CagraSearchParams maps to C.cagra_search_params_t +type CagraSearchParams struct { + ItopkSize uint64 + SearchWidth uint64 +} + +func DefaultCagraSearchParams() CagraSearchParams { + return CagraSearchParams{ + ItopkSize: 64, + SearchWidth: 1, + } +} + +// IvfFlatBuildParams maps to C.ivf_flat_build_params_t +type IvfFlatBuildParams struct { + NLists uint32 + AddDataOnBuild bool + KmeansTrainsetFraction float64 +} + +func DefaultIvfFlatBuildParams() IvfFlatBuildParams { + return IvfFlatBuildParams{ + NLists: 1024, + AddDataOnBuild: true, + KmeansTrainsetFraction: 0.5, + } +} + +// IvfFlatSearchParams maps to C.ivf_flat_search_params_t +type IvfFlatSearchParams struct { + NProbes uint32 +} + 
+func DefaultIvfFlatSearchParams() IvfFlatSearchParams { + return IvfFlatSearchParams{ + NProbes: 20, + } +} + +// Float16 is a 16-bit floating point type (IEEE 754-2008). +// Go does not have a native float16 type, so we use uint16 to represent its memory layout. +type Float16 uint16 + +// VectorType is a constraint for types that can be used as vector data. +type VectorType interface { + float32 | Float16 | int8 | uint8 +} + +// GetQuantization returns the Quantization enum for a given VectorType. +func GetQuantization[T VectorType]() Quantization { + var zero T + switch any(zero).(type) { + case float32: + return F32 + case Float16: + return F16 + case int8: + return INT8 + case uint8: + return UINT8 + default: + panic("unsupported vector type") + } +} + +// GpuConvertF32ToF16 converts a float32 slice to a Float16 slice using the GPU. +func GpuConvertF32ToF16(src []float32, dst []Float16, deviceID int) error { + if len(src) == 0 { + return nil + } + if len(src) != len(dst) { + return moerr.NewInternalErrorNoCtx("source and destination slices must have the same length") + } + + var errmsg *C.char + C.gpu_convert_f32_to_f16( + (*C.float)(unsafe.Pointer(&src[0])), + unsafe.Pointer(&dst[0]), + C.uint64_t(len(src)), + C.int(deviceID), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(src) + runtime.KeepAlive(dst) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// GetGpuDeviceCount returns the number of available CUDA devices. +func GetGpuDeviceCount() (int, error) { + count := int(C.gpu_get_device_count()) + if count < 0 { + return 0, moerr.NewInternalErrorNoCtx("failed to get GPU device count") + } + return count, nil +} + +// GetGpuDeviceList returns a slice of available CUDA device IDs. 
+func GetGpuDeviceList() ([]int, error) { + count, err := GetGpuDeviceCount() + if err != nil { + return nil, err + } + if count == 0 { + return []int{}, nil + } + + cDevices := make([]C.int, count) + actualCount := int(C.gpu_get_device_list(&cDevices[0], C.int(count))) + + devices := make([]int, actualCount) + for i := 0; i < actualCount; i++ { + devices[i] = int(cDevices[i]) + } + runtime.KeepAlive(cDevices) + return devices, nil +} diff --git a/pkg/cuvs/helper_test.go b/pkg/cuvs/helper_test.go new file mode 100644 index 0000000000000..b2986f23dde44 --- /dev/null +++ b/pkg/cuvs/helper_test.go @@ -0,0 +1,50 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + + +package cuvs + +import ( + "testing" +) + +func TestGpuHelpers(t *testing.T) { + count, err := GetGpuDeviceCount() + if err != nil { + t.Fatalf("GetGpuDeviceCount failed: %v", err) + } + t.Logf("GPU Device Count: %d", count) + + devices, err := GetGpuDeviceList() + if err != nil { + t.Fatalf("GetGpuDeviceList failed: %v", err) + } + t.Logf("GPU Device List: %v", devices) +} + +func TestGpuConvertF32ToF16(t *testing.T) { + src := []float32{1.0, 2.0, 3.0, 4.0} + deviceID := 0 + + // Test conversion to F16 + dstF16 := make([]Float16, len(src)) + if err := GpuConvertF32ToF16(src, dstF16, deviceID); err != nil { + t.Fatalf("GpuConvertF32ToF16 failed: %v", err) + } + // We can't easily verify the value without a float16 decoder, + // but we can check it didn't error. +} diff --git a/pkg/cuvs/ivf_flat.go b/pkg/cuvs/ivf_flat.go new file mode 100644 index 0000000000000..72f6daafff04e --- /dev/null +++ b/pkg/cuvs/ivf_flat.go @@ -0,0 +1,269 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +package cuvs + +/* +#include "../../cgo/cuvs/ivf_flat_c.h" +#include +#include +*/ +import "C" +import ( + "runtime" + "unsafe" + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuIvfFlat represents the C++ gpu_ivf_flat_t object. +type GpuIvfFlat[T VectorType] struct { + cIvfFlat C.gpu_ivf_flat_c + dimension uint32 +} + +// NewGpuIvfFlat creates a new GpuIvfFlat instance from a dataset. 
+func NewGpuIvfFlat[T VectorType](dataset []T, count uint64, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_new( + unsafe.Pointer(&dataset[0]), + C.uint64_t(count), + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuIvfFlat") + } + + return &GpuIvfFlat[T]{cIvfFlat: cIvfFlat, dimension: dimension}, nil +} + +// NewGpuIvfFlatFromFile creates a new GpuIvfFlat instance by loading from a file. 
+func NewGpuIvfFlatFromFile[T VectorType](filename string, dimension uint32, metric DistanceType, + bp IvfFlatBuildParams, devices []int, nthread uint32, mode DistributionMode) (*GpuIvfFlat[T], error) { + if len(devices) == 0 { + return nil, moerr.NewInternalErrorNoCtx("at least one device must be specified") + } + + qtype := GetQuantization[T]() + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + cDevices := make([]C.int, len(devices)) + for i, d := range devices { + cDevices[i] = C.int(d) + } + + cBP := C.ivf_flat_build_params_t{ + n_lists: C.uint32_t(bp.NLists), + add_data_on_build: C.bool(bp.AddDataOnBuild), + kmeans_trainset_fraction: C.double(bp.KmeansTrainsetFraction), + } + + cIvfFlat := C.gpu_ivf_flat_load_file( + cFilename, + C.uint32_t(dimension), + C.distance_type_t(metric), + cBP, + &cDevices[0], + C.int(len(devices)), + C.uint32_t(nthread), + C.distribution_mode_t(mode), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(cDevices) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to load GpuIvfFlat from file") + } + + return &GpuIvfFlat[T]{cIvfFlat: cIvfFlat, dimension: dimension}, nil +} + +// Destroy frees the C++ gpu_ivf_flat_t instance +func (gi *GpuIvfFlat[T]) Destroy() error { + if gi.cIvfFlat == nil { + return nil + } + var errmsg *C.char + C.gpu_ivf_flat_destroy(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + gi.cIvfFlat = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Load triggers the build or file loading process +func (gi *GpuIvfFlat[T]) Load() error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + 
C.gpu_ivf_flat_load(gi.cIvfFlat, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Save serializes the index to a file +func (gi *GpuIvfFlat[T]) Save(filename string) error { + if gi.cIvfFlat == nil { + return moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + var errmsg *C.char + cFilename := C.CString(filename) + defer C.free(unsafe.Pointer(cFilename)) + + C.gpu_ivf_flat_save(gi.cIvfFlat, cFilename, unsafe.Pointer(&errmsg)) + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Search performs a K-Nearest Neighbor search +func (gi *GpuIvfFlat[T]) Search(queries []T, numQueries uint64, dimension uint32, limit uint32, sp IvfFlatSearchParams) (SearchResultIvfFlat, error) { + if gi.cIvfFlat == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + if len(queries) == 0 || numQueries == 0 { + return SearchResultIvfFlat{}, nil + } + + var errmsg *C.char + cSP := C.ivf_flat_search_params_t{ + n_probes: C.uint32_t(sp.NProbes), + } + + res := C.gpu_ivf_flat_search( + gi.cIvfFlat, + unsafe.Pointer(&queries[0]), + C.uint64_t(numQueries), + C.uint32_t(dimension), + C.uint32_t(limit), + cSP, + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(queries) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return SearchResultIvfFlat{}, moerr.NewInternalErrorNoCtx("search returned nil result") + } + + totalElements := uint64(numQueries) * uint64(limit) + neighbors := make([]int64, totalElements) + distances := make([]float32, totalElements) + + C.gpu_ivf_flat_get_neighbors(res.result_ptr, C.uint64_t(totalElements), 
(*C.int64_t)(unsafe.Pointer(&neighbors[0]))) + C.gpu_ivf_flat_get_distances(res.result_ptr, C.uint64_t(totalElements), (*C.float)(unsafe.Pointer(&distances[0]))) + runtime.KeepAlive(neighbors) + runtime.KeepAlive(distances) + + C.gpu_ivf_flat_free_result(res.result_ptr) + + return SearchResultIvfFlat{ + Neighbors: neighbors, + Distances: distances, + }, nil +} + +// GetCenters retrieves the trained centroids. +func (gi *GpuIvfFlat[T]) GetCenters(nLists uint32) ([]float32, error) { + if gi.cIvfFlat == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuIvfFlat is not initialized") + } + centers := make([]float32, nLists*gi.dimension) + var errmsg *C.char + C.gpu_ivf_flat_get_centers(gi.cIvfFlat, (*C.float)(&centers[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centers) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centers, nil +} + +// GetNList retrieves the number of lists (centroids) in the index. +func (gi *GpuIvfFlat[T]) GetNList() uint32 { + if gi.cIvfFlat == nil { + return 0 + } + return uint32(C.gpu_ivf_flat_get_n_list(gi.cIvfFlat)) +} + +// SearchResultIvfFlat contains the neighbors and distances from an IVF-Flat search. +type SearchResultIvfFlat struct { + Neighbors []int64 + Distances []float32 +} diff --git a/pkg/cuvs/ivf_flat_test.go b/pkg/cuvs/ivf_flat_test.go new file mode 100644 index 0000000000000..d2a664440ee44 --- /dev/null +++ b/pkg/cuvs/ivf_flat_test.go @@ -0,0 +1,152 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +package cuvs + +import ( + "os" + "testing" +) + +func TestGpuIvfFlat(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(1000) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 10 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + defer index.Destroy() + + err = index.Load() + if err != nil { + t.Fatalf("Failed to load/build GpuIvfFlat: %v", err) + } + + centers, err := index.GetCenters(10) + if err != nil { + t.Fatalf("GetCenters failed: %v", err) + } + t.Logf("Centers: %v", centers[:4]) + + queries := []float32{1.0, 1.0, 100.0, 100.0} + sp := DefaultIvfFlatSearchParams() + sp.NProbes = 5 + result, err := index.Search(queries, 2, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + + t.Logf("Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) + if result.Neighbors[0] != 1 { + t.Errorf("Expected neighbor 1, got %d", result.Neighbors[0]) + } + if result.Neighbors[1] != 100 { + t.Errorf("Expected neighbor 100, got %d", result.Neighbors[1]) + } +} + +func TestGpuIvfFlatSaveLoad(t *testing.T) { + dimension := uint32(2) + n_vectors := uint64(100) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := range dataset { dataset[i] = 
float32(i) } + + devices := []int{0} + bp := DefaultIvfFlatBuildParams() + bp.NLists = 2 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat: %v", err) + } + index.Load() + + filename := "test_ivf_flat.idx" + err = index.Save(filename) + if err != nil { + t.Fatalf("Save failed: %v", err) + } + defer os.Remove(filename) + index.Destroy() + + index2, err := NewGpuIvfFlatFromFile[float32](filename, dimension, L2Expanded, bp, devices, 1, SingleGpu) + if err != nil { + t.Fatalf("Failed to create GpuIvfFlat from file: %v", err) + } + defer index2.Destroy() + + err = index2.Load() + if err != nil { + t.Fatalf("Load from file failed: %v", err) + } + + queries := []float32{0.0, 0.0} + sp := DefaultIvfFlatSearchParams() + result, err := index2.Search(queries, 1, dimension, 1, sp) + if err != nil { + t.Fatalf("Search failed: %v", err) + } + if result.Neighbors[0] != 0 { + t.Errorf("Expected 0, got %d", result.Neighbors[0]) + } +} + +func TestGpuShardedIvfFlat(t *testing.T) { + count, _ := GetGpuDeviceCount() + if count < 1 { + t.Skip("Need at least 1 GPU for sharded IVF-Flat test") + } + + devices := []int{0} + dimension := uint32(2) + n_vectors := uint64(100) + dataset := make([]float32, n_vectors*uint64(dimension)) + for i := uint64(0); i < n_vectors; i++ { + dataset[i*uint64(dimension)] = float32(i) + dataset[i*uint64(dimension)+1] = float32(i) + } + + bp := DefaultIvfFlatBuildParams() + bp.NLists = 5 + index, err := NewGpuIvfFlat[float32](dataset, n_vectors, dimension, L2Expanded, bp, devices, 1, Sharded) + if err != nil { + t.Fatalf("Failed to create sharded IVF-Flat: %v", err) + } + defer index.Destroy() + + err = index.Load() + if err != nil { + t.Fatalf("Load sharded failed: %v", err) + } + + queries := []float32{0.1, 0.1, 0.2, 0.2, 0.3, 0.3, 0.4, 0.4, 0.5, 0.5} + sp := DefaultIvfFlatSearchParams() + result, err := index.Search(queries, 5, dimension, 1, 
sp) + if err != nil { + t.Fatalf("Search sharded failed: %v", err) + } + t.Logf("Sharded Neighbors: %v, Distances: %v", result.Neighbors, result.Distances) +} diff --git a/pkg/cuvs/kmeans.go b/pkg/cuvs/kmeans.go new file mode 100644 index 0000000000000..06f49ad85bf88 --- /dev/null +++ b/pkg/cuvs/kmeans.go @@ -0,0 +1,201 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +package cuvs + +/* +#include "../../cgo/cuvs/kmeans_c.h" +#include +#include +*/ +import "C" +import ( + "runtime" + "unsafe" + "github.com/matrixorigin/matrixone/pkg/common/moerr" +) + +// GpuKMeans represents the C++ gpu_kmeans_t object. +type GpuKMeans[T VectorType] struct { + cKMeans C.gpu_kmeans_c + nClusters uint32 + dimension uint32 +} + +// NewGpuKMeans creates a new GpuKMeans instance. 
+func NewGpuKMeans[T VectorType](nClusters uint32, dimension uint32, metric DistanceType, maxIter int, deviceID int, nthread uint32) (*GpuKMeans[T], error) { + qtype := GetQuantization[T]() + + var errmsg *C.char + cKMeans := C.gpu_kmeans_new( + C.uint32_t(nClusters), + C.uint32_t(dimension), + C.distance_type_t(metric), + C.int(maxIter), + C.int(deviceID), + C.uint32_t(nthread), + C.quantization_t(qtype), + unsafe.Pointer(&errmsg), + ) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + + if cKMeans == nil { + return nil, moerr.NewInternalErrorNoCtx("failed to create GpuKMeans") + } + return &GpuKMeans[T]{cKMeans: cKMeans, nClusters: nClusters, dimension: dimension}, nil +} + +// Destroy frees the C++ gpu_kmeans_t instance +func (gk *GpuKMeans[T]) Destroy() error { + if gk.cKMeans == nil { + return nil + } + var errmsg *C.char + C.gpu_kmeans_destroy(gk.cKMeans, unsafe.Pointer(&errmsg)) + gk.cKMeans = nil + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return moerr.NewInternalErrorNoCtx(errStr) + } + return nil +} + +// Fit computes the cluster centroids. +func (gk *GpuKMeans[T]) Fit(dataset []T, nSamples uint64) (float32, int64, error) { + if gk.cKMeans == nil { + return 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return 0, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_fit( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + return float32(res.inertia), int64(res.n_iter), nil +} + +// Predict assigns labels to new data based on existing centroids. 
+func (gk *GpuKMeans[T]) Predict(dataset []T, nSamples uint64) ([]int64, float32, error) { + if gk.cKMeans == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_predict( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, moerr.NewInternalErrorNoCtx("predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), nil +} + +// FitPredict performs both fitting and labeling in one step. 
+func (gk *GpuKMeans[T]) FitPredict(dataset []T, nSamples uint64) ([]int64, float32, int64, error) { + if gk.cKMeans == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + if len(dataset) == 0 || nSamples == 0 { + return nil, 0, 0, nil + } + + var errmsg *C.char + res := C.gpu_kmeans_fit_predict( + gk.cKMeans, + unsafe.Pointer(&dataset[0]), + C.uint64_t(nSamples), + unsafe.Pointer(&errmsg), + ) + runtime.KeepAlive(dataset) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, 0, 0, moerr.NewInternalErrorNoCtx(errStr) + } + + if res.result_ptr == nil { + return nil, 0, 0, moerr.NewInternalErrorNoCtx("fit_predict returned nil result") + } + + labels := make([]int64, nSamples) + C.gpu_kmeans_get_labels(res.result_ptr, C.uint64_t(nSamples), (*C.int64_t)(unsafe.Pointer(&labels[0]))) + runtime.KeepAlive(labels) + + C.gpu_kmeans_free_result(res.result_ptr) + + return labels, float32(res.inertia), int64(res.n_iter), nil +} + +// GetCentroids retrieves the trained centroids. +func (gk *GpuKMeans[T]) GetCentroids() ([]T, error) { + if gk.cKMeans == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuKMeans is not initialized") + } + centroids := make([]T, gk.nClusters*gk.dimension) + var errmsg *C.char + C.gpu_kmeans_get_centroids(gk.cKMeans, unsafe.Pointer(&centroids[0]), unsafe.Pointer(&errmsg)) + runtime.KeepAlive(centroids) + + if errmsg != nil { + errStr := C.GoString(errmsg) + C.free(unsafe.Pointer(errmsg)) + return nil, moerr.NewInternalErrorNoCtx(errStr) + } + return centroids, nil +} diff --git a/pkg/cuvs/kmeans_test.go b/pkg/cuvs/kmeans_test.go new file mode 100644 index 0000000000000..faae9c5f579bc --- /dev/null +++ b/pkg/cuvs/kmeans_test.go @@ -0,0 +1,170 @@ +//go:build gpu + +// Copyright 2021 - 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + + +package cuvs + +import ( + "testing" + "fmt" +) + +func TestGpuKMeans_Float32(t *testing.T) { + nClusters := uint32(3) + dimension := uint32(2) + nSamples := uint64(9) + + // Create 3 clusters + dataset := []float32{ + 0.1, 0.1, 0.0, 0.2, 0.2, 0.0, // Cluster 0 + 10.1, 10.1, 10.0, 10.2, 10.2, 10.0, // Cluster 1 + 20.1, 20.1, 20.0, 20.2, 20.2, 20.0, // Cluster 2 + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[float32](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + inertia, nIter, err := kmeans.Fit(dataset, nSamples) + if err != nil { + t.Fatalf("Fit failed: %v", err) + } + fmt.Printf("Fit: inertia=%f, nIter=%d\n", inertia, nIter) + + labels, pInertia, err := kmeans.Predict(dataset, nSamples) + if err != nil { + t.Fatalf("Predict failed: %v", err) + } + fmt.Printf("Predict labels: %v, inertia=%f\n", labels, pInertia) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } + + // Since we use balanced_params, it might prioritize balancing cluster sizes over spatial distance + // on very small datasets. We just check that all labels are within range [0, nClusters). 
+ for i, l := range labels { + if l < 0 || l >= int64(nClusters) { + t.Errorf("Label at index %d is out of range: %d", i, l) + } + } + + centroids, err := kmeans.GetCentroids() + if err != nil { + t.Fatalf("GetCentroids failed: %v", err) + } + if len(centroids) != int(nClusters*dimension) { + t.Errorf("Expected %d centroid elements, got %d", nClusters*dimension, len(centroids)) + } +} + +func TestGpuKMeans_FitPredict_Float16(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(4) + nSamples := uint64(10) + + dataset := make([]float32, nSamples*uint64(dimension)) + for i := range dataset { + dataset[i] = 0.5 + } + + // Convert to F16 + datasetF16 := make([]Float16, len(dataset)) + err := GpuConvertF32ToF16(dataset, datasetF16, 0) + if err != nil { + t.Fatalf("F32 to F16 conversion failed: %v", err) + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[Float16](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + labels, inertia, nIter, err := kmeans.FitPredict(datasetF16, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("FitPredict: inertia=%f, nIter=%d\n", inertia, nIter) + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } +} + +func TestGpuKMeans_Int8(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(2) + nSamples := uint64(4) + + dataset := []int8{ + 0, 0, + 1, 1, + 10, 10, + 11, 11, + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[int8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + labels, _, _, err := kmeans.FitPredict(dataset, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("Int8 Predict labels: %v\n", labels) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) 
+ } +} + +func TestGpuKMeans_Uint8(t *testing.T) { + nClusters := uint32(2) + dimension := uint32(2) + nSamples := uint64(4) + + dataset := []uint8{ + 0, 0, + 1, 1, + 10, 10, + 11, 11, + } + + deviceID := 0 + kmeans, err := NewGpuKMeans[uint8](nClusters, dimension, L2Expanded, 20, deviceID, 1) + if err != nil { + t.Fatalf("Failed to create GpuKMeans: %v", err) + } + defer kmeans.Destroy() + + labels, _, _, err := kmeans.FitPredict(dataset, nSamples) + if err != nil { + t.Fatalf("FitPredict failed: %v", err) + } + fmt.Printf("Uint8 Predict labels: %v\n", labels) + + if len(labels) != int(nSamples) { + t.Errorf("Expected %d labels, got %d", nSamples, len(labels)) + } +} diff --git a/pkg/frontend/variables.go b/pkg/frontend/variables.go index 72895372b8e01..488c263fbb385 100644 --- a/pkg/frontend/variables.go +++ b/pkg/frontend/variables.go @@ -3607,14 +3607,6 @@ var gSysVarsDefs = map[string]SystemVariable{ Type: InitSystemVariableBoolType("ivf_preload_entries"), Default: int8(0), }, - "ivf_small_centroid_threshold": { - Name: "ivf_small_centroid_threshold", - Scope: ScopeBoth, - Dynamic: true, - SetVarHintApplies: false, - Type: InitSystemVariableIntType("ivf_small_centroid_threshold", 0, 1024, false), - Default: int64(0), - }, "enable_vector_prefilter_by_default": { Name: "enable_vector_prefilter_by_default", Scope: ScopeSession, diff --git a/pkg/sql/colexec/productl2/product_l2.go b/pkg/sql/colexec/productl2/product_l2.go index 33472c3c1071c..ad3b1372ab7f7 100644 --- a/pkg/sql/colexec/productl2/product_l2.go +++ b/pkg/sql/colexec/productl2/product_l2.go @@ -18,6 +18,7 @@ import ( "bytes" "runtime" "strings" + "sync" "time" "github.com/matrixorigin/matrixone/pkg/common/moerr" @@ -58,6 +59,10 @@ func (productl2 *Productl2) Prepare(proc *process.Process) error { } productl2.ctr.metrictype = metrictype + if productl2.ctr.sqlproc == nil { + productl2.ctr.sqlproc = sqlexec.NewSqlProcess(proc) + } + return nil } @@ -127,14 +132,7 @@ func (productl2 *Productl2) Call(proc 
*process.Process) (vm.CallResult, error) { } -func NewNullVector[T types.RealNumbers](dim int32) []T { - // null vector with magnitude 1 - nullvec := make([]T, dim) - nullvec[0] = 1 - return nullvec -} - -func getIndex[T types.RealNumbers](ap *Productl2, proc *process.Process, analyzer process.Analyzer) (cache.VectorIndexSearchIf, error) { +func getIndex[T types.RealNumbers](ap *Productl2, proc *process.Process, analyzer process.Analyzer, centers [][]T, nullvec []T) (cache.VectorIndexSearchIf, error) { ctr := &ap.ctr buildCount := ctr.bat.RowCount() centroidColPos := ap.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() @@ -143,8 +141,13 @@ func getIndex[T types.RealNumbers](ap *Productl2, proc *process.Process, analyze dim := centroidVec.GetType().Width elemSize := uint(centroidVec.GetType().GetArrayElementSize()) - centers := make([][]T, buildCount) - nullvec := NewNullVector[T](dim) + + if len(nullvec) > 0 { + nullvec[0] = 1 + for i := 1; i < len(nullvec); i++ { + nullvec[i] = 0 + } + } for i := 0; i < buildCount; i++ { if centroidVec.IsNull(uint64(i)) { @@ -156,12 +159,12 @@ func getIndex[T types.RealNumbers](ap *Productl2, proc *process.Process, analyze centers[i] = c } - algo, err := brute_force.NewBruteForceIndex[T](centers, uint(dim), ctr.metrictype, elemSize) + algo, err := brute_force.NewBruteForceIndex[T](centers, uint(dim), ctr.metrictype, elemSize, 1) if err != nil { return nil, err } - err = algo.Load(sqlexec.NewSqlProcess(proc)) + err = algo.Load(ctr.sqlproc) if err != nil { return nil, err } @@ -195,12 +198,16 @@ func (productl2 *Productl2) build(proc *process.Process, analyzer process.Analyz switch centroidVec.GetType().Oid { case types.T_array_float32: - ctr.brute_force, err = getIndex[float32](productl2, proc, analyzer) + ctr.centersF32 = get1D[[]float32](&pool2DF32, ctr.bat.RowCount()) + ctr.nullvecF32 = get1D[float32](&pool1DF32, int(centroidVec.GetType().Width)) + ctr.brute_force, err = getIndex[float32](productl2, proc, analyzer, 
*ctr.centersF32, *ctr.nullvecF32) if err != nil { return err } case types.T_array_float64: - ctr.brute_force, err = getIndex[float64](productl2, proc, analyzer) + ctr.centersF64 = get1D[[]float64](&pool2DF64, ctr.bat.RowCount()) + ctr.nullvecF64 = get1D[float64](&pool1DF64, int(centroidVec.GetType().Width)) + ctr.brute_force, err = getIndex[float64](productl2, proc, analyzer, *ctr.centersF64, *ctr.nullvecF64) if err != nil { return err } @@ -209,36 +216,59 @@ func (productl2 *Productl2) build(proc *process.Process, analyzer process.Analyz return nil } -//var ( -// arrayF32Pool = sync.Pool{ -// New: func() interface{} { -// s := make([]float32, 0) -// return &s -// }, -// } -// arrayF64Pool = sync.Pool{ -// New: func() interface{} { -// s := make([]float64, 0) -// return &s -// }, -// } -//) - -func newMat[T types.RealNumbers](ctr *container, ap *Productl2) ([][]T, error) { +var ( + pool1DF32 = sync.Pool{New: func() any { x := make([]float32, 0); return &x }} + pool1DF64 = sync.Pool{New: func() any { x := make([]float64, 0); return &x }} + pool2DF32 = sync.Pool{New: func() any { x := make([][]float32, 0); return &x }} + pool2DF64 = sync.Pool{New: func() any { x := make([][]float64, 0); return &x }} +) + +func get1D[T any](pool *sync.Pool, n int) *[]T { + val := pool.Get() + if val == nil { + newSlice := make([]T, n) + return &newSlice + } + v, ok := val.(*[]T) + if !ok || v == nil { + newSlice := make([]T, n) + return &newSlice + } + if cap(*v) < n { + if n > 0 { + pool.Put(v) + newSlice := make([]T, n) + return &newSlice + } + *v = (*v)[:0] + return v + } + *v = (*v)[:n] + return v +} + +func put1D[T any](pool *sync.Pool, v *[]T) { + var zero T + for i := range *v { + (*v)[i] = zero + } + *v = (*v)[:0] + pool.Put(v) +} + +func newMat[T types.RealNumbers](ctr *container, ap *Productl2, probes [][]T, nullvec []T) ([][]T, error) { probeCount := ctr.inBat.RowCount() tblColPos := ap.OnExpr.GetF().GetArgs()[1].GetCol().GetColPos() tblColVec := ctr.inBat.Vecs[tblColPos] - 
// dimension can only get from centroid column. probe column input values can be null and dimension is 0. - centroidColPos := ap.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() - centroidVec := ctr.bat.Vecs[centroidColPos] - dim := centroidVec.GetType().Width - nullvec := NewNullVector[T](dim) + if len(nullvec) > 0 { + nullvec[0] = 1 + for i := 1; i < len(nullvec); i++ { + nullvec[i] = 0 + } + } - // embedding mat - probes := make([][]T, probeCount) for j := 0; j < probeCount; j++ { - if tblColVec.IsNull(uint64(j)) { probes[j] = nullvec continue @@ -266,6 +296,22 @@ func (ctr *container) release() { ctr.brute_force.Destroy() ctr.brute_force = nil } + if ctr.centersF32 != nil { + put1D(&pool2DF32, ctr.centersF32) + ctr.centersF32 = nil + } + if ctr.centersF64 != nil { + put1D(&pool2DF64, ctr.centersF64) + ctr.centersF64 = nil + } + if ctr.nullvecF32 != nil { + put1D(&pool1DF32, ctr.nullvecF32) + ctr.nullvecF32 = nil + } + if ctr.nullvecF64 != nil { + put1D(&pool1DF64, ctr.nullvecF64) + ctr.nullvecF64 = nil + } } func probeRun[T types.RealNumbers](ctr *container, ap *Productl2, proc *process.Process, result *vm.CallResult) error { @@ -273,6 +319,10 @@ func probeRun[T types.RealNumbers](ctr *container, ap *Productl2, proc *process. tblColPos := ap.OnExpr.GetF().GetArgs()[1].GetCol().GetColPos() tblColVec := ctr.inBat.Vecs[tblColPos] + centroidColPos := ap.OnExpr.GetF().GetArgs()[0].GetCol().GetColPos() + centroidVec := ctr.bat.Vecs[centroidColPos] + dim := int(centroidVec.GetType().Width) + ncpu := runtime.NumCPU() if probeCount < ncpu { ncpu = probeCount @@ -285,14 +335,37 @@ func probeRun[T types.RealNumbers](ctr *container, ap *Productl2, proc *process. 
} } - probes, err := newMat[T](ctr, ap) + var _t T + var probes [][]T + var nullvec []T + + switch any(_t).(type) { + case float32: + p := get1D[[]float32](&pool2DF32, probeCount) + defer put1D(&pool2DF32, p) + probes = any(*p).([][]T) + + n := get1D[float32](&pool1DF32, dim) + defer put1D(&pool1DF32, n) + nullvec = any(*n).([]T) + case float64: + p := get1D[[]float64](&pool2DF64, probeCount) + defer put1D(&pool2DF64, p) + probes = any(*p).([][]T) + + n := get1D[float64](&pool1DF64, dim) + defer put1D(&pool1DF64, n) + nullvec = any(*n).([]T) + } + + probes, err := newMat[T](ctr, ap, probes, nullvec) if err != nil { return err } rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: uint(ncpu)} - anykeys, distances, err := ctr.brute_force.Search(sqlexec.NewSqlProcess(proc), probes, rt) + anykeys, distances, err := ctr.brute_force.Search(ctr.sqlproc, probes, rt) if err != nil { return err } diff --git a/pkg/sql/colexec/productl2/types.go b/pkg/sql/colexec/productl2/types.go index 6effcf0a7d824..65f435150fd5e 100644 --- a/pkg/sql/colexec/productl2/types.go +++ b/pkg/sql/colexec/productl2/types.go @@ -22,6 +22,7 @@ import ( "github.com/matrixorigin/matrixone/pkg/sql/colexec" "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" "github.com/matrixorigin/matrixone/pkg/vm" "github.com/matrixorigin/matrixone/pkg/vm/process" ) @@ -41,6 +42,13 @@ type container struct { inBat *batch.Batch // probe batch metrictype metric.MetricType brute_force cache.VectorIndexSearchIf // brute_force.BruteForceIndex + + sqlproc *sqlexec.SqlProcess + + centersF32 *[][]float32 + centersF64 *[][]float64 + nullvecF32 *[]float32 + nullvecF64 *[]float64 } type Productl2 struct { diff --git a/pkg/sql/colexec/table_function/ivf_create.go b/pkg/sql/colexec/table_function/ivf_create.go index 46c19ea38d850..a72e251314d63 100644 --- a/pkg/sql/colexec/table_function/ivf_create.go +++ 
b/pkg/sql/colexec/table_function/ivf_create.go @@ -25,6 +25,7 @@ import ( "github.com/matrixorigin/matrixone/pkg/container/batch" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/container/vector" + "github.com/matrixorigin/matrixone/pkg/logutil" "github.com/matrixorigin/matrixone/pkg/sql/colexec" "github.com/matrixorigin/matrixone/pkg/vectorindex" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat" @@ -80,6 +81,7 @@ func clustering[T types.RealNumbers](u *ivfCreateState, tf *TableFunction, proc nworker := vectorindex.GetConcurrencyForBuild(u.tblcfg.ThreadsBuild) + logutil.Infof("IVFFLAT START: Kmeans clustering") // NOTE: We use L2 distance to caculate centroid. Ivfflat metric just for searching. var centers [][]T if clusterer, err = device.NewKMeans( @@ -99,6 +101,8 @@ func clustering[T types.RealNumbers](u *ivfCreateState, tf *TableFunction, proc return err } + logutil.Infof("IVFFLAT END: Kmeans clustering") + centers, ok = anycenters.([][]T) if !ok { return moerr.NewInternalError(proc.Ctx, "centers is not [][]float64") @@ -115,6 +119,7 @@ func clustering[T types.RealNumbers](u *ivfCreateState, tf *TableFunction, proc return moerr.NewInternalError(proc.Ctx, "output centroids is empty") } + logutil.Infof("IVFFLAT START: After Kmeans clustering, insert centroids to table") sql := fmt.Sprintf("INSERT INTO `%s`.`%s` (`%s`, `%s`, `%s`) VALUES %s", u.tblcfg.DbName, u.tblcfg.IndexTable, catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, catalog.SystemSI_IVFFLAT_TblCol_Centroids_id, @@ -131,6 +136,7 @@ func clustering[T types.RealNumbers](u *ivfCreateState, tf *TableFunction, proc } res.Close() } + logutil.Infof("IVFFLAT END: After Kmeans clustering, insert centroids to table") return nil } @@ -260,20 +266,27 @@ func (u *ivfCreateState) start(tf *TableFunction, proc *process.Process, nthRow } } + if u.sample_ratio > 1.0 { + u.sample_ratio = 1.0 + } + // run SQL - sql := fmt.Sprintf("SELECT `%s` FROM `%s`.`%s` WHERE 
`%s` IS NOT NULL AND RAND() < %f LIMIT %d", + sql := fmt.Sprintf("SELECT SAMPLE(`%s`, %f PERCENT) FROM `%s`.`%s` WHERE `%s` IS NOT NULL LIMIT %d", u.tblcfg.KeyPart, + u.sample_ratio*100, u.tblcfg.DbName, u.tblcfg.SrcTable, u.tblcfg.KeyPart, - u.sample_ratio, u.nsample) + logutil.Infof("IVFFLAT START: pick sample. %s", sql) + res, err := ivf_runSql(sqlexec.NewSqlProcess(proc), sql) if err != nil { return err } defer res.Close() + logutil.Infof("IVFFLAT END: pick sample") if len(res.Batches) == 0 { return nil diff --git a/pkg/vectorindex/brute_force/benchmark_test.go b/pkg/vectorindex/brute_force/benchmark_test.go new file mode 100644 index 0000000000000..bfa2782154525 --- /dev/null +++ b/pkg/vectorindex/brute_force/benchmark_test.go @@ -0,0 +1,105 @@ +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package brute_force + +import ( + "math/rand/v2" + "testing" + + "github.com/matrixorigin/matrixone/pkg/common/mpool" + "github.com/matrixorigin/matrixone/pkg/testutil" + "github.com/matrixorigin/matrixone/pkg/vectorindex" + "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" +) + +func benchmarkBruteForceGeneric(b *testing.B, dsize, qsize int, dimension uint, ncpu uint, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + b.Helper() + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(b, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + limit := uint(10) + elemsz := uint(4) // float32 + + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + query := make([][]float32, qsize) + for i := range query { + query[i] = make([]float32, dimension) + for j := range query[i] { + query[i][j] = rand.Float32() + } + } + + idx, err := createFn(dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) + if err != nil { + b.Fatal(err) + } + defer idx.Destroy() + + err = idx.Load(sqlproc) + if err != nil { + b.Fatal(err) + } + + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _, err := idx.Search(sqlproc, query, rt) + if err != nil { + b.Fatal(err) + } + } +} + +func benchmarkBruteForce(b *testing.B, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + benchmarkBruteForceGeneric(b, 10000, 100, 1024, 8, createFn) +} + +func benchmarkCentroidSearch(b *testing.B, createFn func([][]float32, uint, metric.MetricType, uint, uint) (cache.VectorIndexSearchIf, error)) { + benchmarkBruteForceGeneric(b, 18000, 1, 1024, 1, createFn) +} + +func 
BenchmarkGoBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGoBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkUsearchBruteForce(b *testing.B) { + benchmarkBruteForce(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewUsearchBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchGoBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewGoBruteForceIndex[float32](dataset, dim, m, es) + }) +} + +func BenchmarkCentroidSearchUsearchBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, func(dataset [][]float32, dim uint, m metric.MetricType, es uint, nt uint) (cache.VectorIndexSearchIf, error) { + return NewUsearchBruteForceIndex[float32](dataset, dim, m, es) + }) +} diff --git a/pkg/vectorindex/brute_force/brute_force.go b/pkg/vectorindex/brute_force/brute_force.go index 6c1d2fe899d10..bdf217dd75433 100644 --- a/pkg/vectorindex/brute_force/brute_force.go +++ b/pkg/vectorindex/brute_force/brute_force.go @@ -18,9 +18,9 @@ import ( "context" "fmt" "runtime" - "slices" "github.com/matrixorigin/matrixone/pkg/common/concurrent" + "github.com/matrixorigin/matrixone/pkg/common/malloc" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/container/types" @@ -29,16 +29,16 @@ import ( "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" usearch "github.com/unum-cloud/usearch/golang" - "github.com/viterin/partial" ) type UsearchBruteForceIndex[T types.RealNumbers] struct { - Dataset []T // flattend vector + Dataset *[]T // flattend vector Metric 
usearch.Metric Dimension uint Count uint Quantization usearch.Quantization ElementSize uint + deallocator malloc.Deallocator } type GoBruteForceIndex[T types.RealNumbers] struct { @@ -67,12 +67,7 @@ func NewCpuBruteForceIndex[T types.RealNumbers](dataset [][]T, m metric.MetricType, elemsz uint) (cache.VectorIndexSearchIf, error) { - switch m { - case metric.Metric_L1Distance: - return NewGoBruteForceIndex(dataset, dimension, m, elemsz) - default: - return NewUsearchBruteForceIndex(dataset, dimension, m, elemsz) - } + return NewGoBruteForceIndex(dataset, dimension, m, elemsz) } func NewGoBruteForceIndex[T types.RealNumbers](dataset [][]T, @@ -104,10 +99,38 @@ func NewUsearchBruteForceIndex[T types.RealNumbers](dataset [][]T, idx.Count = uint(len(dataset)) idx.ElementSize = elemsz - idx.Dataset = make([]T, idx.Count*idx.Dimension) + reqSize := int(idx.Count * idx.Dimension) + + allocator := malloc.NewCAllocator() + + var _t T + switch any(_t).(type) { + case float32: + slice, deallocator, err := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) + if err != nil { + return nil, err + } + idx.deallocator = deallocator + f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) + idx.Dataset = any(&f32Slice).(*[]T) + case float64: + slice, deallocator, err := allocator.Allocate(uint64(reqSize)*8, malloc.NoClear) + if err != nil { + return nil, err + } + idx.deallocator = deallocator + f64Slice := util.UnsafeSliceCastToLength[float64](slice, reqSize) + idx.Dataset = any(&f64Slice).(*[]T) + default: + // Fallback + ds := make([]T, reqSize) + idx.Dataset = &ds + } + + ds := *idx.Dataset for i := 0; i < len(dataset); i++ { offset := i * int(dimension) - copy(idx.Dataset[offset:], dataset[i]) + copy(ds[offset:], dataset[i]) } return idx, nil @@ -124,14 +147,37 @@ func (idx *UsearchBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries } var flatten []T - if len(queries) == 1 { - flatten = queries[0] - } else { - flatten = make([]T, 
len(queries)*int(idx.Dimension)) - for i := 0; i < len(queries); i++ { - offset := i * int(idx.Dimension) - copy(flatten[offset:], queries[i]) + var queryDeallocator malloc.Deallocator + + reqSize := len(queries) * int(idx.Dimension) + allocator := malloc.NewCAllocator() + var _t T + switch any(_t).(type) { + case float32: + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 } + queryDeallocator = dealloc + f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) + flatten = any(f32Slice).([]T) + case float64: + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*8, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f64Slice := util.UnsafeSliceCastToLength[float64](slice, reqSize) + flatten = any(f64Slice).([]T) + } + + for i := 0; i < len(queries); i++ { + offset := i * int(idx.Dimension) + copy(flatten[offset:], queries[i]) + } + + if queryDeallocator != nil { + defer queryDeallocator.Deallocate() } //fmt.Printf("flattened %v\n", flatten) @@ -142,7 +188,7 @@ func (idx *UsearchBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries } keys_ui64, distances_f32, err := usearch.ExactSearchUnsafe( - util.UnsafePointer(&(idx.Dataset[0])), + util.UnsafePointer(&((*idx.Dataset)[0])), util.UnsafePointer(&(flatten[0])), uint(idx.Count), uint(len(queries)), @@ -179,6 +225,13 @@ func (idx *UsearchBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf } func (idx *UsearchBruteForceIndex[T]) Destroy() { + if idx.deallocator != nil { + idx.deallocator.Deallocate() + idx.deallocator = nil + idx.Dataset = nil + } else if idx.Dataset != nil { + idx.Dataset = nil + } } func (idx *GoBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) error { @@ -204,90 +257,85 @@ func (idx *GoBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, } nthreads := rt.NThreads - - // datasize * nqueries nqueries := len(queries) - ndataset := 
len(idx.Dataset) + limit := int(rt.Limit) - // create distance matric - results := make([][]vectorindex.SearchResult, nqueries) - for i := range results { - results[i] = make([]vectorindex.SearchResult, ndataset) + if limit == 0 { + return []int64{}, []float64{}, nil } + totalReturn := nqueries * limit + retKeys64 := make([]int64, totalReturn) + retDistances := make([]float64, totalReturn) + exec := concurrent.NewThreadPoolExecutor(int(nthreads)) err = exec.Execute( proc.GetContext(), nqueries, func(ctx context.Context, thread_id int, start, end int) (err2 error) { - subqueries := queries[start:end:end] - subresults := results[start:end:end] - for k, q := range subqueries { + // Pre-allocate heap buffers for this thread + var heapKeysBuf []int64 + var heapDistBuf []T + if limit > 1 { + heapKeysBuf = make([]int64, limit) + heapDistBuf = make([]T, limit) + } + + for k := start; k < end; k++ { + q := queries[k] if k%100 == 0 && ctx.Err() != nil { return ctx.Err() } + if limit == 1 { + minDist := metric.MaxFloat[T]() + minIdx := -1 + for j := range idx.Dataset { + dist, err2 := distfn(q, idx.Dataset[j]) + if err2 != nil { + return err2 + } + if dist < minDist { + minDist = dist + minIdx = j + } + } + retKeys64[k*limit] = int64(minIdx) + retDistances[k*limit] = float64(minDist) + continue + } + + // Max-heap logic for K > 1 + h := vectorindex.NewFastMaxHeap(limit, heapKeysBuf, heapDistBuf) + for j := range idx.Dataset { dist, err2 := distfn(q, idx.Dataset[j]) if err2 != nil { return err2 } - subresults[k][j].Id = int64(j) - subresults[k][j].Distance = float64(dist) - } - } - return - }) - - if err != nil { - return nil, nil, err - } - - cmpfn := func(a, b vectorindex.SearchResult) int { - if a.Distance < b.Distance { - return -1 - } else if a.Distance == b.Distance { - return 0 - } - return 1 - } - - // get min - keys64 := make([]int64, nqueries*int(rt.Limit)) - distances = make([]float64, nqueries*int(rt.Limit)) - err = exec.Execute( - proc.GetContext(), - nqueries, - 
func(ctx context.Context, thread_id int, start, end int) (err2 error) { - subresults := results[start:end:end] - for j := range subresults { - if j%100 == 0 && ctx.Err() != nil { - return ctx.Err() + h.Push(int64(j), dist) } - if rt.Limit == 1 { - // min - first := slices.MinFunc(subresults[j], cmpfn) - subresults[j][0] = first - - } else { - // partial sort - partial.SortFunc(subresults[j], int(rt.Limit), cmpfn) - + // Extract from heap and place into results in sorted order (smallest first) + offset := k * limit + for j := limit - 1; j >= 0; j-- { + key, dist, ok := h.Pop() + if !ok { + // Pad with invalid if not enough data + retKeys64[offset+j] = -1 + retDistances[offset+j] = 0 + continue + } + retKeys64[offset+j] = key + retDistances[offset+j] = float64(dist) } } return }) + if err != nil { return nil, nil, err } - for i := 0; i < nqueries; i++ { - for j := 0; j < int(rt.Limit); j++ { - keys64[i*int(rt.Limit)+j] = results[i][j].Id - distances[i*int(rt.Limit)+j] = results[i][j].Distance - } - } - - return keys64, distances, nil + return retKeys64, retDistances, nil } diff --git a/pkg/vectorindex/brute_force/brute_force_test.go b/pkg/vectorindex/brute_force/brute_force_test.go index 21cf130271463..7a119bbb8c8b6 100644 --- a/pkg/vectorindex/brute_force/brute_force_test.go +++ b/pkg/vectorindex/brute_force/brute_force_test.go @@ -19,6 +19,7 @@ package brute_force import ( "fmt" "math/rand/v2" + "sort" "testing" "github.com/matrixorigin/matrixone/pkg/common/mpool" @@ -151,3 +152,81 @@ func TestGoBruteForceConcurrent(t *testing.T) { func TestUsearchBruteForceConcurrent(t *testing.T) { runBruteForceConcurrent(t, true) } + +func TestGoBruteForceHeapLogic(t *testing.T) { + // Generate random dataset + dsize := 1000 + dimension := uint(16) + dataset := make([][]float32, dsize) + for i := range dataset { + dataset[i] = make([]float32, dimension) + for j := range dataset[i] { + dataset[i][j] = rand.Float32() + } + } + + qsize := 10 + queries := make([][]float32, qsize) + 
for i := range queries { + queries[i] = make([]float32, dimension) + for j := range queries[i] { + queries[i][j] = rand.Float32() + } + } + + m := mpool.MustNewZero() + proc := testutil.NewProcessWithMPool(t, "", m) + sqlproc := sqlexec.NewSqlProcess(proc) + elemsz := uint(4) + + idx, err := NewGoBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + require.NoError(t, err) + + limits := []uint{1, 5, 50, 1000} + + for _, limit := range limits { + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 2} + keysAny, dists, err := idx.Search(sqlproc, queries, rt) + require.NoError(t, err) + + keys := keysAny.([]int64) + require.Equal(t, int(limit)*qsize, len(keys)) + require.Equal(t, int(limit)*qsize, len(dists)) + + // Verify correctness for each query + for i := 0; i < qsize; i++ { + type res struct { + id int64 + dist float64 + } + allRes := make([]res, dsize) + for j := 0; j < dsize; j++ { + d, _ := metric.L2DistanceSq(queries[i], dataset[j]) + allRes[j] = res{id: int64(j), dist: float64(d)} + } + + // Sort by distance ascending, then ID ascending for stability + sort.Slice(allRes, func(a, b int) bool { + if allRes[a].dist == allRes[b].dist { + return allRes[a].id < allRes[b].id + } + return allRes[a].dist < allRes[b].dist + }) + + // Check top K + for j := 0; j < int(limit); j++ { + offset := i*int(limit) + j + expectedDist := allRes[j].dist + actualDist := dists[offset] + + require.InDeltaf(t, expectedDist, actualDist, 1e-5, "Distance mismatch at query %d, rank %d (limit %d)", i, j, limit) + } + + // Check that actual results are sorted + for j := 1; j < int(limit); j++ { + offset := i*int(limit) + j + require.Truef(t, dists[offset] >= dists[offset-1], "Results not sorted at query %d, rank %d", i, j) + } + } + } +} diff --git a/pkg/vectorindex/brute_force/cpu.go b/pkg/vectorindex/brute_force/cpu.go index b60f8e5b68a4b..b5c65f96cf614 100644 --- a/pkg/vectorindex/brute_force/cpu.go +++ b/pkg/vectorindex/brute_force/cpu.go @@ -25,7 
+25,8 @@ import ( func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, dimension uint, m metric.MetricType, - elemsz uint) (cache.VectorIndexSearchIf, error) { + elemsz uint, + nthread uint) (cache.VectorIndexSearchIf, error) { return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) } diff --git a/pkg/vectorindex/brute_force/gpu.go b/pkg/vectorindex/brute_force/gpu.go index 029c32ef152a1..505b305bfd4e3 100644 --- a/pkg/vectorindex/brute_force/gpu.go +++ b/pkg/vectorindex/brute_force/gpu.go @@ -17,90 +17,125 @@ package brute_force import ( - // "fmt" + "github.com/matrixorigin/matrixone/pkg/common/malloc" + "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/matrixorigin/matrixone/pkg/vectorindex" "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" "github.com/matrixorigin/matrixone/pkg/vectorindex/sqlexec" - cuvs "github.com/rapidsai/cuvs/go" - "github.com/rapidsai/cuvs/go/brute_force" ) -type GpuBruteForceIndex[T cuvs.TensorNumberType] struct { - Resource *cuvs.Resource // shared resource for read-only index - Dataset *cuvs.Tensor[T] - Index *brute_force.BruteForceIndex - Metric cuvs.Distance - Dimension uint - Count uint - ElementSize uint +type GpuBruteForceIndex[T cuvs.VectorType] struct { + index *cuvs.GpuBruteForce[T] + dimension uint + count uint } var _ cache.VectorIndexSearchIf = &GpuBruteForceIndex[float32]{} -// cuvs library has bug. 
comment out the GPU version until cuvs fix the bug +func resolveCuvsDistance(m metric.MetricType) cuvs.DistanceType { + switch m { + case metric.Metric_L2sqDistance: + return cuvs.L2Expanded + case metric.Metric_L2Distance: + return cuvs.L2Expanded + case metric.Metric_InnerProduct: + return cuvs.InnerProduct + case metric.Metric_CosineDistance: + return cuvs.CosineSimilarity + case metric.Metric_L1Distance: + return cuvs.L1 + default: + return cuvs.L2Expanded + } +} + func NewBruteForceIndex[T types.RealNumbers](dataset [][]T, dimension uint, m metric.MetricType, - elemsz uint) (cache.VectorIndexSearchIf, error) { + elemsz uint, + nthread uint) (cache.VectorIndexSearchIf, error) { switch dset := any(dataset).(type) { case [][]float64: return NewCpuBruteForceIndex[T](dataset, dimension, m, elemsz) case [][]float32: - return NewCpuBruteForceIndex[float32](dset, dimension, m, elemsz) - //return NewGpuBruteForceIndex[float32](dset, dimension, m, elemsz) + return NewGpuBruteForceIndex[float32](dset, dimension, m, elemsz, nthread) + case [][]uint16: + // Convert [][]uint16 to [][]cuvs.Float16 to pass to NewGpuBruteForceIndex + f16dset := make([][]cuvs.Float16, len(dset)) + for i, v := range dset { + f16dset[i] = util.UnsafeSliceCast[cuvs.Float16](v) + } + return NewGpuBruteForceIndex[cuvs.Float16](f16dset, dimension, m, elemsz, nthread) default: return nil, moerr.NewInternalErrorNoCtx("type not supported for BruteForceIndex") } - } -func NewGpuBruteForceIndex[T cuvs.TensorNumberType](dataset [][]T, +func NewGpuBruteForceIndex[T cuvs.VectorType](dataset [][]T, dimension uint, m metric.MetricType, - elemsz uint) (cache.VectorIndexSearchIf, error) { + elemsz uint, + nthread uint) (cache.VectorIndexSearchIf, error) { - idx := &GpuBruteForceIndex[T]{} - resource, _ := cuvs.NewResource(nil) - idx.Resource = &resource - tensor, err := cuvs.NewTensor(dataset) - if err != nil { - return nil, err + if len(dataset) == 0 { + return nil, moerr.NewInternalErrorNoCtx("empty dataset") 
} - idx.Dataset = &tensor - idx.Metric = metric.MetricTypeToCuvsMetric[m] - idx.Dimension = dimension - idx.Count = uint(len(dataset)) - - idx.ElementSize = elemsz - return idx, nil -} + dim := int(dimension) + reqSize := len(dataset) * dim + var flattened []T -func (idx *GpuBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) (err error) { - if _, err = idx.Dataset.ToDevice(idx.Resource); err != nil { - return err + var _t T + switch any(_t).(type) { + case float32: + allocator := malloc.NewCAllocator() + slice, deallocator, err := allocator.Allocate(uint64(reqSize*4), malloc.NoClear) + if err != nil { + return nil, err + } + defer deallocator.Deallocate() + flattened = any(util.UnsafeSliceCast[float32](slice)).([]T) + case cuvs.Float16: + allocator := malloc.NewCAllocator() + slice, deallocator, err := allocator.Allocate(uint64(reqSize*2), malloc.NoClear) + if err != nil { + return nil, err + } + defer deallocator.Deallocate() + flattened = any(util.UnsafeSliceCast[cuvs.Float16](slice)).([]T) + default: + ds := make([]T, reqSize) + flattened = ds } - idx.Index, err = brute_force.CreateIndex() - if err != nil { - return + for i, v := range dataset { + copy(flattened[i*dim:(i+1)*dim], v) } - err = brute_force.BuildIndex[T](*idx.Resource, idx.Dataset, idx.Metric, 0, idx.Index) + deviceID := 0 // Default to device 0 + km, err := cuvs.NewGpuBruteForce[T](flattened, uint64(len(dataset)), uint32(dimension), resolveCuvsDistance(m), uint32(nthread), deviceID) if err != nil { - return + return nil, err } - if err = idx.Resource.Sync(); err != nil { - return - } + return &GpuBruteForceIndex[T]{ + index: km, + dimension: dimension, + count: uint(len(dataset)), + }, nil +} - return +func (idx *GpuBruteForceIndex[T]) Load(sqlproc *sqlexec.SqlProcess) (err error) { + if idx.index == nil { + return moerr.NewInternalErrorNoCtx("GpuBruteForce not initialized") + } + return idx.index.Load() } func (idx *GpuBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, rt 
vectorindex.RuntimeConfig) (retkeys any, retdistances []float64, err error) { @@ -109,77 +144,61 @@ func (idx *GpuBruteForceIndex[T]) Search(proc *sqlexec.SqlProcess, _queries any, return nil, nil, moerr.NewInternalErrorNoCtx("queries type invalid") } - // local resource for concurrent search - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, nil, err + if len(queriesvec) == 0 { + return nil, nil, nil } - defer resource.Close() - queries, err := cuvs.NewTensor(queriesvec) - if err != nil { - return nil, nil, err - } - defer queries.Close() + dim := int(idx.dimension) + reqSize := len(queriesvec) * dim - neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) - if err != nil { - return nil, nil, err - } - defer neighbors.Close() - - distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(rt.Limit)}) - if err != nil { - return nil, nil, err - } - defer distances.Close() - - if _, err = queries.ToDevice(&resource); err != nil { - return nil, nil, err - } + var flattenedQueries []T + var queryDeallocator malloc.Deallocator - err = brute_force.SearchIndex(resource, *idx.Index, &queries, &neighbors, &distances) - if err != nil { - return nil, nil, err - } - - if _, err = neighbors.ToHost(&resource); err != nil { - return nil, nil, err - } - - if _, err = distances.ToHost(&resource); err != nil { - return nil, nil, err + var _t T + switch any(_t).(type) { + case float32: + allocator := malloc.NewCAllocator() + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*4, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + queryDeallocator = dealloc + f32Slice := util.UnsafeSliceCastToLength[float32](slice, reqSize) + flattenedQueries = any(f32Slice).([]T) + case cuvs.Float16: + allocator := malloc.NewCAllocator() + slice, dealloc, err2 := allocator.Allocate(uint64(reqSize)*2, malloc.NoClear) + if err2 != nil { + return nil, nil, err2 + } + 
queryDeallocator = dealloc + f16Slice := util.UnsafeSliceCastToLength[cuvs.Float16](slice, reqSize) + flattenedQueries = any(f16Slice).([]T) + default: + // Not pooling other types, although T is likely only float32 for CUVS + ds := make([]T, reqSize) + flattenedQueries = ds } - if err = resource.Sync(); err != nil { - return nil, nil, err + for i, v := range queriesvec { + copy(flattenedQueries[i*dim:(i+1)*dim], v) } - neighborsSlice, err := neighbors.Slice() - if err != nil { - return nil, nil, err + if queryDeallocator != nil { + defer queryDeallocator.Deallocate() } - distancesSlice, err := distances.Slice() + neighbors, distances, err := idx.index.Search(flattenedQueries, uint64(len(queriesvec)), uint32(idx.dimension), uint32(rt.Limit)) if err != nil { return nil, nil, err } - //fmt.Printf("flattened %v\n", flatten) - retdistances = make([]float64, len(distancesSlice)*int(rt.Limit)) - for i := range distancesSlice { - for j, dist := range distancesSlice[i] { - retdistances[i*int(rt.Limit)+j] = float64(dist) - } + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } - keys := make([]int64, len(neighborsSlice)*int(rt.Limit)) - for i := range neighborsSlice { - for j, key := range neighborsSlice[i] { - keys[i*int(rt.Limit)+j] = int64(key) - } - } - retkeys = keys + retkeys = neighbors return } @@ -188,13 +207,7 @@ func (idx *GpuBruteForceIndex[T]) UpdateConfig(sif cache.VectorIndexSearchIf) er } func (idx *GpuBruteForceIndex[T]) Destroy() { - if idx.Dataset != nil { - idx.Dataset.Close() - } - if idx.Resource != nil { - idx.Resource.Close() - } - if idx.Index != nil { - idx.Index.Close() + if idx.index != nil { + idx.index.Destroy() } } diff --git a/pkg/vectorindex/brute_force/gpu_benchmark_test.go b/pkg/vectorindex/brute_force/gpu_benchmark_test.go new file mode 100644 index 0000000000000..1c7c9dbf20081 --- /dev/null +++ b/pkg/vectorindex/brute_force/gpu_benchmark_test.go @@ -0,0 +1,29 @@ +//go:build gpu 
+ +// Copyright 2022 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package brute_force + +import ( + "testing" +) + +func BenchmarkGpuBruteForce(b *testing.B) { + benchmarkBruteForce(b, NewGpuBruteForceIndex[float32]) +} + +func BenchmarkCentroidSearchGpuBruteForce(b *testing.B) { + benchmarkCentroidSearch(b, NewGpuBruteForceIndex[float32]) +} diff --git a/pkg/vectorindex/brute_force/gpu_test.go b/pkg/vectorindex/brute_force/gpu_test.go index d9b024f5444cd..d1b341d797c21 100644 --- a/pkg/vectorindex/brute_force/gpu_test.go +++ b/pkg/vectorindex/brute_force/gpu_test.go @@ -17,7 +17,6 @@ package brute_force import ( - //"fmt" "math/rand/v2" "sync" "testing" @@ -35,22 +34,22 @@ func TestGpuBruteForce(t *testing.T) { dataset := [][]float32{{1, 2, 3}, {3, 4, 5}} query := [][]float32{{1, 2, 3}, {3, 4, 5}} dimension := uint(3) - ncpu := uint(1) + ncpu := uint(8) limit := uint(1) elemsz := uint(4) // float32 - idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() err = idx.Load(nil) require.NoError(t, err) - rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} var wg sync.WaitGroup - for n := 0; n < 4; n++ { + for n := 0; n < 8; n++ { wg.Add(1) go func() { @@ -66,7 +65,6 @@ func 
TestGpuBruteForce(t *testing.T) { require.Equal(t, key, int64(j)) require.Equal(t, distances[j], float64(0)) } - // fmt.Printf("keys %v, dist %v\n", keys, distances) } }() } @@ -81,7 +79,7 @@ func TestGpuBruteForceConcurrent(t *testing.T) { proc := testutil.NewProcessWithMPool(t, "", m) sqlproc := sqlexec.NewSqlProcess(proc) dimension := uint(128) - ncpu := uint(4) + ncpu := uint(8) limit := uint(3) elemsz := uint(4) // float32 @@ -96,7 +94,7 @@ func TestGpuBruteForceConcurrent(t *testing.T) { query := dataset - idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := NewGpuBruteForceIndex[float32](dataset, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() @@ -105,13 +103,12 @@ func TestGpuBruteForceConcurrent(t *testing.T) { // limit 3 { - rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: limit, NThreads: 1} anykeys, distances, err := idx.Search(sqlproc, query, rt) require.NoError(t, err) keys := anykeys.([]int64) - // fmt.Printf("keys %v, dist %v\n", keys, distances) require.Equal(t, int(rt.Limit)*len(query), len(keys)) for i := range query { offset := i * int(rt.Limit) @@ -122,13 +119,12 @@ func TestGpuBruteForceConcurrent(t *testing.T) { // limit 1 { - rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: ncpu} + rt := vectorindex.RuntimeConfig{Limit: 1, NThreads: 1} anykeys, distances, err := idx.Search(sqlproc, query, rt) require.NoError(t, err) keys := anykeys.([]int64) - // fmt.Printf("keys %v, dist %v\n", keys, distances) require.Equal(t, int(rt.Limit)*len(query), len(keys)) for i := range query { offset := i * int(rt.Limit) diff --git a/pkg/vectorindex/index.go b/pkg/vectorindex/index.go index 496335863183f..ee63aff2145e9 100644 --- a/pkg/vectorindex/index.go +++ b/pkg/vectorindex/index.go @@ -19,6 +19,7 @@ import ( "crypto/md5" "encoding/hex" "fmt" + 
"github.com/matrixorigin/matrixone/pkg/container/types" "io" "os" "sync" @@ -153,3 +154,122 @@ func (h *SearchResultSafeHeap) Pop() SearchResultIf { x := heap.Pop(&h.resheap).(SearchResultIf) return x } + +// FastMaxHeap is a highly optimized, generic bounded max-heap designed specifically for +// vector search Top-K operations. +// +// Benefits over standard container/heap: +// 1. Zero Interface Boxing: By using generics and specific array layouts, it completely avoids +// the heap-escape "boxing" allocations caused by passing interface{} around. +// 2. Struct of Arrays (SoA): Uses independent slices for keys and distances rather than an +// Array of Structs (AoS). This dramatically improves CPU cache locality during distance +// comparisons. +// 3. Inline Array Reuse: Requires passing pre-allocated backing buffers to ensure zero +// allocations inside tight loops. +// 4. Bounded Logic: Natively handles "Limit/K" bounded sizing directly during the push step, +// reducing structural overhead. +type FastMaxHeap[T types.RealNumbers] struct { + keys []int64 + distances []T + size int + limit int +} + +// NewFastMaxHeap initializes the FastMaxHeap using caller-provided buffer slices +// to guarantee zero-allocation operations during tight query loops. 
+func NewFastMaxHeap[T types.RealNumbers](limit int, keysBuf []int64, distsBuf []T) *FastMaxHeap[T] { + return &FastMaxHeap[T]{ + keys: keysBuf, + distances: distsBuf, + size: 0, + limit: limit, + } +} + +func (h *FastMaxHeap[T]) siftUp(j int) { + for { + i := (j - 1) / 2 // parent + if i == j || h.distances[j] <= h.distances[i] { + break + } + h.distances[i], h.distances[j] = h.distances[j], h.distances[i] + h.keys[i], h.keys[j] = h.keys[j], h.keys[i] + j = i + } +} + +func (h *FastMaxHeap[T]) siftDown(i0, n int) { + i := i0 + for { + j1 := 2*i + 1 + if j1 >= n || j1 < 0 { // j1 < 0 after int overflow + break + } + j := j1 // left child + if j2 := j1 + 1; j2 < n && h.distances[j2] > h.distances[j1] { + j = j2 // right child + } + if h.distances[j] <= h.distances[i] { + break + } + h.distances[i], h.distances[j] = h.distances[j], h.distances[i] + h.keys[i], h.keys[j] = h.keys[j], h.keys[i] + i = j + } +} + +// Push inserts a new element into the max-heap. If the heap is at its limit, +// it replaces the maximum (root) element if the new distance is smaller. +func (h *FastMaxHeap[T]) Push(key int64, dist T) { + if h.size < h.limit { + h.distances[h.size] = dist + h.keys[h.size] = key + h.siftUp(h.size) + h.size++ + } else if dist < h.distances[0] { + h.distances[0] = dist + h.keys[0] = key + h.siftDown(0, h.limit) + } +} + +// Pop extracts the element with the largest distance from the max-heap. 
+func (h *FastMaxHeap[T]) Pop() (int64, T, bool) { + if h.size == 0 { + return -1, 0, false + } + h.size-- + key := h.keys[0] + dist := h.distances[0] + + h.keys[0] = h.keys[h.size] + h.distances[0] = h.distances[h.size] + h.siftDown(0, h.size) + + return key, dist, true +} + +// Thread-safe wrapper for FastMaxHeap +type FastMaxHeapSafe[T types.RealNumbers] struct { + mutex sync.Mutex + heap *FastMaxHeap[T] +} + +// NewFastMaxHeapSafe creates a thread-safe FastMaxHeap +func NewFastMaxHeapSafe[T types.RealNumbers](limit int, keysBuf []int64, distsBuf []T) *FastMaxHeapSafe[T] { + return &FastMaxHeapSafe[T]{ + heap: NewFastMaxHeap(limit, keysBuf, distsBuf), + } +} + +func (s *FastMaxHeapSafe[T]) Push(key int64, dist T) { + s.mutex.Lock() + defer s.mutex.Unlock() + s.heap.Push(key, dist) +} + +func (s *FastMaxHeapSafe[T]) Pop() (int64, T, bool) { + s.mutex.Lock() + defer s.mutex.Unlock() + return s.heap.Pop() +} diff --git a/pkg/vectorindex/index_test.go b/pkg/vectorindex/index_test.go index bacbca116845b..788c903c03b30 100644 --- a/pkg/vectorindex/index_test.go +++ b/pkg/vectorindex/index_test.go @@ -211,3 +211,71 @@ func TestGetConcurrency(t *testing.T) { require.Equal(t, int64(4), nthread) } + +func TestFastMaxHeap(t *testing.T) { + limit := 3 + keysBuf := make([]int64, limit) + distsBuf := make([]float32, limit) + + h := NewFastMaxHeap(limit, keysBuf, distsBuf) + + // Add 5 items, we only want the 3 smallest distances + h.Push(10, float32(10.0)) + h.Push(5, float32(5.0)) + h.Push(20, float32(20.0)) + h.Push(1, float32(1.0)) + h.Push(8, float32(8.0)) + + // Expected distances in the heap (the 3 smallest): 1.0, 5.0, 8.0 + // Because it is a max-heap of the minimums, popping should return the largest distance first: 8.0, 5.0, 1.0 + + key, dist, ok := h.Pop() + require.True(t, ok) + require.Equal(t, int64(8), key) + require.Equal(t, float32(8.0), dist) + + key, dist, ok = h.Pop() + require.True(t, ok) + require.Equal(t, int64(5), key) + require.Equal(t, float32(5.0), 
dist) + + key, dist, ok = h.Pop() + require.True(t, ok) + require.Equal(t, int64(1), key) + require.Equal(t, float32(1.0), dist) + + _, _, ok = h.Pop() + require.False(t, ok) +} + +func TestFastMaxHeapSafe(t *testing.T) { + limit := 5 + keysBuf := make([]int64, limit) + distsBuf := make([]float32, limit) + + h := NewFastMaxHeapSafe(limit, keysBuf, distsBuf) + + var wg sync.WaitGroup + // Push 100 elements concurrently. The 5 smallest should be 0, 1, 2, 3, 4 + for i := 0; i < 100; i++ { + wg.Add(1) + go func(val int) { + defer wg.Done() + h.Push(int64(val), float32(val)) + }(i) + } + + wg.Wait() + + // Because it's a bounded max-heap holding the K smallest distances, + // popping should yield the largest of the top 5 first: 4, 3, 2, 1, 0 + for expected := 4; expected >= 0; expected-- { + key, dist, ok := h.Pop() + require.True(t, ok) + require.Equal(t, int64(expected), key) + require.Equal(t, float32(expected), dist) + } + + _, _, ok := h.Pop() + require.False(t, ok) +} diff --git a/pkg/vectorindex/ivfflat/kmeans/balanced/balanced.go b/pkg/vectorindex/ivfflat/kmeans/balanced/balanced.go new file mode 100644 index 0000000000000..a0ce6f38961dc --- /dev/null +++ b/pkg/vectorindex/ivfflat/kmeans/balanced/balanced.go @@ -0,0 +1,410 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package balanced + +import ( + "context" + "math" + "math/rand/v2" + "runtime" + "slices" + + "github.com/matrixorigin/matrixone/pkg/common/concurrent" + "github.com/matrixorigin/matrixone/pkg/common/malloc" + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/common/util" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" +) + +type BalancedKMeans[T types.RealNumbers] struct { + vectorList [][]T + clusterCnt int + maxIterations int + distFn metric.DistanceFunction[T] + normalize bool + nworker int + + centroids [][]T + assignments []int + + // pre-allocated buffers + indices []int + c1 []T + c2 []T + diffs []pointDiff + localAssign []int + + deallocators []malloc.Deallocator +} + +var _ kmeans.Clusterer = new(BalancedKMeans[float32]) + +func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, + maxIterations int, deltaThreshold float64, + distanceType metric.MetricType, initType kmeans.InitType, + spherical bool, + nworker int, +) (kmeans.Clusterer, error) { + + err := validateArgs[T](vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType) + if err != nil { + return nil, err + } + + distanceFunction, normalize, err := metric.ResolveKmeansDistanceFn[T](distanceType, spherical) + if err != nil { + return nil, err + } + + if nworker <= 0 { + nworker = runtime.NumCPU() + } + + allocator := malloc.NewCAllocator() + var deallocators []malloc.Deallocator + + allocSlice := func(size uint64) []byte { + slice, deallocator, err := allocator.Allocate(size, malloc.NoClear) + if err != nil { + panic(err) // OOM + } + deallocators = append(deallocators, deallocator) + return slice + } + + dim := len(vectors[0]) + numVectors := len(vectors) + + // allocate centroids (outer slice + inner slices) + centroidsBytes := allocSlice(uint64(clusterCnt) * 
uint64(util.UnsafeSizeOf[[]T]())) + centroids := util.UnsafeSliceCastToLength[[]T](centroidsBytes, clusterCnt) + for i := range centroids { + innerBytes := allocSlice(uint64(dim) * uint64(util.UnsafeSizeOf[T]())) + centroids[i] = util.UnsafeSliceCastToLength[T](innerBytes, dim) + } + + // allocate assignments + assignmentsBytes := allocSlice(uint64(numVectors) * uint64(util.UnsafeSizeOf[int]())) + assignments := util.UnsafeSliceCastToLength[int](assignmentsBytes, numVectors) + + // allocate indices + indicesBytes := allocSlice(uint64(numVectors) * uint64(util.UnsafeSizeOf[int]())) + indices := util.UnsafeSliceCastToLength[int](indicesBytes, numVectors) + + // allocate c1, c2 + c1Bytes := allocSlice(uint64(dim) * uint64(util.UnsafeSizeOf[T]())) + c1 := util.UnsafeSliceCastToLength[T](c1Bytes, dim) + c2Bytes := allocSlice(uint64(dim) * uint64(util.UnsafeSizeOf[T]())) + c2 := util.UnsafeSliceCastToLength[T](c2Bytes, dim) + + // allocate diffs + diffsBytes := allocSlice(uint64(numVectors) * uint64(util.UnsafeSizeOf[pointDiff]())) + diffs := util.UnsafeSliceCastToLength[pointDiff](diffsBytes, numVectors) + + // allocate localAssign + localAssignBytes := allocSlice(uint64(numVectors) * uint64(util.UnsafeSizeOf[int]())) + localAssign := util.UnsafeSliceCastToLength[int](localAssignBytes, numVectors) + + return &BalancedKMeans[T]{ + vectorList: vectors, + clusterCnt: clusterCnt, + maxIterations: maxIterations, + distFn: distanceFunction, + normalize: normalize, + nworker: nworker, + centroids: centroids, + assignments: assignments, + indices: indices, + c1: c1, + c2: c2, + diffs: diffs, + localAssign: localAssign, + deallocators: deallocators, + }, nil +} + +func validateArgs[T types.RealNumbers](vectorList [][]T, clusterCnt, + maxIterations int, deltaThreshold float64, + distanceType metric.MetricType, initType kmeans.InitType) error { + if len(vectorList) == 0 || len(vectorList[0]) == 0 { + return moerr.NewInternalErrorNoCtx("input vectors is empty") + } + if clusterCnt 
> len(vectorList) { + return moerr.NewInternalErrorNoCtxf("cluster count is larger than vector count %d > %d", clusterCnt, len(vectorList)) + } + if maxIterations < 0 { + return moerr.NewInternalErrorNoCtxf("max iteration is out of bounds (must be >= 0)") + } + if distanceType >= metric.Metric_TypeCount { + return moerr.NewInternalErrorNoCtx("distance type is not supported") + } + + vlen := -1 + for _, v := range vectorList { + if vlen == -1 { + vlen = len(v) + } + if vlen != len(v) { + return moerr.NewInternalErrorNoCtx("input vectors not in same dimension") + } + } + return nil +} + +func (km *BalancedKMeans[T]) InitCentroids(ctx context.Context) error { + // For balanced divisive k-means, initialization is inherently part of the clustering process. + return nil +} + +func (km *BalancedKMeans[T]) Close() error { + for _, d := range km.deallocators { + d.Deallocate() + } + km.deallocators = nil + return nil +} + +type pointDiff struct { + index int + diff float64 +} + +func (km *BalancedKMeans[T]) Cluster(ctx context.Context) (any, error) { + if km.normalize { + for i := range km.vectorList { + metric.NormalizeL2(km.vectorList[i], km.vectorList[i]) + } + } + + if len(km.vectorList) == km.clusterCnt { + for i := 0; i < km.clusterCnt; i++ { + copy(km.centroids[i], km.vectorList[i]) + km.assignments[i] = i + } + return km.centroids, nil + } + + for i := range km.indices { + km.indices[i] = i + } + + rnd := rand.New(rand.NewPCG(uint64(kmeans.DefaultRandSeed), 0)) + + exec := concurrent.NewThreadPoolExecutor(km.nworker) + err := km.bisectBalanced(ctx, km.indices, km.clusterCnt, 0, exec, km.c1, km.c2, km.diffs, km.localAssign, rnd) + if err != nil { + return nil, err + } + + return km.centroids, nil +} + +func (km *BalancedKMeans[T]) bisectBalanced( + ctx context.Context, + indices []int, + k int, + clusterStart int, + exec concurrent.ThreadPoolExecutor, + c1, c2 []T, + diffs []pointDiff, + localAssign []int, + rnd *rand.Rand, +) error { + if k == 1 { + 
computeMeanFromIndicesInPlace(km.vectorList, indices, km.centroids[clusterStart]) + if km.normalize { + metric.NormalizeL2(km.centroids[clusterStart], km.centroids[clusterStart]) + } + for _, idx := range indices { + km.assignments[idx] = clusterStart + } + return nil + } + + n := len(indices) + k1 := k / 2 + k2 := k - k1 + + // Proportion of data + n1 := int((int64(n) * int64(k1)) / int64(k)) + if n1 == 0 { + n1 = 1 + } + if n1 == n { + n1 = n - 1 + } + + // Random initial centers for the bisection + idx1 := rnd.IntN(n) + idx2 := rnd.IntN(n) + for idx1 == idx2 && n > 1 { + idx2 = rnd.IntN(n) + } + copy(c1, km.vectorList[indices[idx1]]) + copy(c2, km.vectorList[indices[idx2]]) + + // use slices for this level of recursion + curDiffs := diffs[:n] + curAssign := localAssign[:n] + + // Create the worker function once outside the iteration loop to avoid allocating closures + workerFn := func(ctx context.Context, thread_id int, start, end int) error { + for i := start; i < end; i++ { + if (i-start)%100 == 0 && ctx.Err() != nil { + return ctx.Err() + } + vIdx := indices[i] + d1, err1 := km.distFn(km.vectorList[vIdx], c1) + if err1 != nil { + return err1 + } + d2, err2 := km.distFn(km.vectorList[vIdx], c2) + if err2 != nil { + return err2 + } + // diff < 0 means closer to c1 + curDiffs[i] = pointDiff{index: i, diff: float64(d1) - float64(d2)} + } + return nil + } + + for iter := 0; iter < km.maxIterations; iter++ { + err := exec.Execute(ctx, n, workerFn) + if err != nil { + return err + } + + slices.SortFunc(curDiffs, func(a, b pointDiff) int { + if a.diff < b.diff { + return -1 + } else if a.diff > b.diff { + return 1 + } + return 0 + }) + + changed := false + for i := 0; i < n1; i++ { + localIdx := curDiffs[i].index + if iter == 0 || curAssign[localIdx] != 0 { + curAssign[localIdx] = 0 + changed = true + } + } + for i := n1; i < n; i++ { + localIdx := curDiffs[i].index + if iter == 0 || curAssign[localIdx] != 1 { + curAssign[localIdx] = 1 + changed = true + } + } + + if 
!changed && iter > 0 { + break + } + + computeMeanFromIndicesAndAssignInPlace(km.vectorList, indices, curAssign, 0, c1) + computeMeanFromIndicesAndAssignInPlace(km.vectorList, indices, curAssign, 1, c2) + if km.normalize { + metric.NormalizeL2(c1, c1) + metric.NormalizeL2(c2, c2) + } + } + + // In-place partition of indices based on curAssign + left, right := 0, n-1 + for left <= right { + for left <= right && curAssign[left] == 0 { + left++ + } + for left <= right && curAssign[right] == 1 { + right-- + } + if left < right { + indices[left], indices[right] = indices[right], indices[left] + curAssign[left], curAssign[right] = curAssign[right], curAssign[left] + left++ + right-- + } + } + + // We can reuse the buffers for the child calls since they are sequential + err := km.bisectBalanced(ctx, indices[:n1], k1, clusterStart, exec, c1, c2, diffs, localAssign, rnd) + if err != nil { + return err + } + + err = km.bisectBalanced(ctx, indices[n1:], k2, clusterStart+k1, exec, c1, c2, diffs, localAssign, rnd) + if err != nil { + return err + } + + return nil +} + +func computeMeanFromIndicesAndAssignInPlace[T types.RealNumbers](data [][]T, indices []int, assignments []int, target int, out []T) { + dim := len(out) + for j := 0; j < dim; j++ { + out[j] = 0 + } + count := 0 + for i, a := range assignments { + if a == target { + vIdx := indices[i] + for j := 0; j < dim; j++ { + out[j] += data[vIdx][j] + } + count++ + } + } + if count > 0 { + for j := 0; j < dim; j++ { + out[j] /= T(count) + } + } +} + +func computeMeanFromIndicesInPlace[T types.RealNumbers](data [][]T, indices []int, out []T) { + if len(indices) == 0 { + return + } + dim := len(out) + for j := 0; j < dim; j++ { + out[j] = 0 + } + for _, vIdx := range indices { + for j := 0; j < dim; j++ { + out[j] += data[vIdx][j] + } + } + for j := 0; j < dim; j++ { + out[j] /= T(len(indices)) + } +} + +// SSE returns the sum of squared errors. 
+func (km *BalancedKMeans[T]) SSE() (float64, error) { + sse := 0.0 + for i := range km.vectorList { + distErr, err := km.distFn(km.vectorList[i], km.centroids[km.assignments[i]]) + if err != nil { + return 0, err + } + sse += math.Pow(float64(distErr), 2) + } + return sse, nil +} diff --git a/pkg/vectorindex/ivfflat/kmeans/balanced/balanced_test.go b/pkg/vectorindex/ivfflat/kmeans/balanced/balanced_test.go new file mode 100644 index 0000000000000..397a21942728c --- /dev/null +++ b/pkg/vectorindex/ivfflat/kmeans/balanced/balanced_test.go @@ -0,0 +1,203 @@ +// Copyright 2024 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package balanced + +import ( + "context" + "fmt" + "math" + "math/rand/v2" + "testing" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" + "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" + "github.com/stretchr/testify/require" +) + +func TestNewKMeans_Validation(t *testing.T) { + vectors := [][]float32{{1, 2}, {3, 4}, {5, 6}} + + // Valid + _, err := NewKMeans(vectors, 2, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 1) + require.NoError(t, err) + + // Cluster count too high + _, err = NewKMeans(vectors, 4, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 1) + require.Error(t, err) + + // Dimension mismatch + mismatch := [][]float32{{1, 2}, {3, 4, 5}} + _, err = NewKMeans(mismatch, 2, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 1) + require.Error(t, err) + + // Empty vectors + _, err = NewKMeans([][]float32{}, 2, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 1) + require.Error(t, err) +} + +func TestBalancedKMeans_Basic(t *testing.T) { + ctx := context.Background() + // 8 points in 2D + vectors := [][]float32{ + {1, 1}, {1.1, 1.1}, {0.9, 0.9}, {1, 0.9}, + {10, 10}, {10.1, 10.1}, {9.9, 9.9}, {10, 9.9}, + } + + km, err := NewKMeans(vectors, 2, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 2) + require.NoError(t, err) + + res, err := km.Cluster(ctx) + require.NoError(t, err) + + centroids := res.([][]float32) + require.Equal(t, 2, len(centroids)) + + // Verify assignments + bkm := km.(*BalancedKMeans[float32]) + counts := make(map[int]int) + for _, a := range bkm.assignments { + counts[a]++ + } + + // Should be perfectly balanced: 4 points each + require.Equal(t, 2, len(counts)) + require.Equal(t, 4, counts[0]) + require.Equal(t, 4, counts[1]) + + sse, err := km.SSE() + require.NoError(t, err) + require.True(t, sse > 0) +} + +func TestBalancedKMeans_K1(t 
*testing.T) { + ctx := context.Background() + vectors := [][]float32{{1, 1}, {2, 2}, {3, 3}} + km, err := NewKMeans(vectors, 1, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 1) + require.NoError(t, err) + + res, err := km.Cluster(ctx) + require.NoError(t, err) + centroids := res.([][]float32) + require.Equal(t, 1, len(centroids)) + require.InDelta(t, 2.0, centroids[0][0], 1e-6) +} + +func TestBalancedKMeans_KN(t *testing.T) { + ctx := context.Background() + vectors := [][]float32{{1, 1}, {2, 2}, {3, 3}} + km, err := NewKMeans(vectors, 3, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 1) + require.NoError(t, err) + + res, err := km.Cluster(ctx) + require.NoError(t, err) + centroids := res.([][]float32) + require.Equal(t, 3, len(centroids)) +} + +func TestBalancedKMeans_Spherical(t *testing.T) { + ctx := context.Background() + // Vectors on unit circle + vectors := [][]float32{ + {1, 0}, {0.99, 0.1}, + {0, 1}, {0.1, 0.99}, + } + km, err := NewKMeans(vectors, 2, 10, 0.01, metric.Metric_CosineDistance, kmeans.Random, true, 1) + require.NoError(t, err) + + res, err := km.Cluster(ctx) + require.NoError(t, err) + centroids := res.([][]float32) + + // Check if centroids are normalized + for _, c := range centroids { + norm := float32(0) + for _, v := range c { + norm += v * v + } + require.InDelta(t, 1.0, math.Sqrt(float64(norm)), 1e-5) + } +} + +func FakeErrorDistance[T types.RealNumbers](v1, v2 []T) (T, error) { + return 0, moerr.NewInternalErrorNoCtx("distance calculation failed") +} + +func TestBalancedKMeans_DistanceError(t *testing.T) { + ctx := context.Background() + vectors := [][]float32{{1, 1}, {2, 2}, {3, 3}, {4, 4}} + km, err := NewKMeans(vectors, 2, 10, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 1) + require.NoError(t, err) + + bkm := km.(*BalancedKMeans[float32]) + bkm.distFn = FakeErrorDistance[float32] + + _, err = km.Cluster(ctx) + require.Error(t, err) + require.Contains(t, err.Error(), "distance calculation failed") +} + 
+func TestBalancedKMeans_LargeBalanced(t *testing.T) { + ctx := context.Background() + n := 1000 + k := 10 + dim := 16 + vectors := make([][]float32, n) + for i := 0; i < n; i++ { + vectors[i] = make([]float32, dim) + for j := 0; j < dim; j++ { + vectors[i][j] = float32(i % (j + 1)) + } + } + + km, err := NewKMeans(vectors, k, 20, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 8) + require.NoError(t, err) + + _, err = km.Cluster(ctx) + require.NoError(t, err) + + bkm := km.(*BalancedKMeans[float32]) + counts := make(map[int]int) + for _, a := range bkm.assignments { + counts[a]++ + } + + require.Equal(t, k, len(counts)) + for i := 0; i < k; i++ { + // 1000 / 10 = 100 per cluster + require.Equal(t, 100, counts[i], fmt.Sprintf("Cluster %d is not balanced", i)) + } +} + +func BenchmarkBalancedKMeans(b *testing.B) { + ctx := context.Background() + n := 10000 + k := 100 + dim := 128 + vectors := make([][]float32, n) + for i := 0; i < n; i++ { + vectors[i] = make([]float32, dim) + for j := 0; j < dim; j++ { + vectors[i][j] = rand.Float32() + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + km, _ := NewKMeans(vectors, k, 15, 0.01, metric.Metric_L2Distance, kmeans.Random, false, 8) + _, _ = km.Cluster(ctx) + } +} diff --git a/pkg/vectorindex/ivfflat/kmeans/device/cpu.go b/pkg/vectorindex/ivfflat/kmeans/device/cpu.go index 0a26d3ca4a1bc..4e57b136823fb 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/cpu.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/cpu.go @@ -19,7 +19,7 @@ package device import ( "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" - "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/elkans" + "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/balanced" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" ) @@ -29,5 +29,5 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, spherical bool, nworker int, ) (kmeans.Clusterer, 
error) { - return elkans.NewKMeans(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType, spherical, nworker) + return balanced.NewKMeans(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType, spherical, nworker) } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go index ed7eecfd58cf9..6d08bb7ea1f57 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu.go @@ -17,84 +17,48 @@ package device import ( - //"os" - "context" "github.com/matrixorigin/matrixone/pkg/common/moerr" "github.com/matrixorigin/matrixone/pkg/container/types" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans/elkans" "github.com/matrixorigin/matrixone/pkg/vectorindex/metric" - cuvs "github.com/rapidsai/cuvs/go" - "github.com/rapidsai/cuvs/go/ivf_flat" ) -type GpuClusterer[T cuvs.TensorNumberType] struct { - indexParams *ivf_flat.IndexParams - nlist int - dim int - vectors [][]T +type GpuClusterer[T cuvs.VectorType] struct { + kmeans *cuvs.GpuKMeans[T] + nlist int + dim int + vectors []T } func (c *GpuClusterer[T]) InitCentroids(ctx context.Context) error { - return nil } func (c *GpuClusterer[T]) Cluster(ctx context.Context) (any, error) { - - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, err + if c.kmeans == nil { + return nil, moerr.NewInternalErrorNoCtx("GpuKMeans not initialized") } - defer resource.Close() - dataset, err := cuvs.NewTensor(c.vectors) + nSamples := uint64(len(c.vectors) / c.dim) + _, _, err := c.kmeans.Fit(c.vectors, nSamples) if err != nil { return nil, err } - defer dataset.Close() - index, err := ivf_flat.CreateIndex(c.indexParams, &dataset) + centroids, err := c.kmeans.GetCentroids() if err != nil { return nil, err } - defer index.Close() - if _, err := 
dataset.ToDevice(&resource); err != nil { - return nil, err - } - - centers, err := cuvs.NewTensorOnDevice[T](&resource, []int64{int64(c.nlist), int64(c.dim)}) - if err != nil { - return nil, err - } - defer centers.Close() - - if err := ivf_flat.BuildIndex(resource, c.indexParams, &dataset, index); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - if err := ivf_flat.GetCenters(index, ¢ers); err != nil { - return nil, err - } - - if _, err := centers.ToHost(&resource); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - result, err := centers.Slice() - if err != nil { - return nil, err + // Reshape centroids back to [][]T + result := make([][]T, c.nlist) + for i := 0; i < c.nlist; i++ { + result[i] = make([]T, c.dim) + copy(result[i], centroids[i*c.dim:(i+1)*c.dim]) } return result, nil @@ -105,26 +69,26 @@ func (c *GpuClusterer[T]) SSE() (float64, error) { } func (c *GpuClusterer[T]) Close() error { - if c.indexParams != nil { - c.indexParams.Close() + if c.kmeans != nil { + return c.kmeans.Destroy() } return nil } -func resolveCuvsDistanceForDense(distance metric.MetricType) cuvs.Distance { +func resolveCuvsDistanceForDense(distance metric.MetricType) cuvs.DistanceType { switch distance { case metric.Metric_L2sqDistance: - return cuvs.DistanceL2 + return cuvs.L2Expanded case metric.Metric_L2Distance: - return cuvs.DistanceL2 + return cuvs.L2Expanded case metric.Metric_InnerProduct: - return cuvs.DistanceL2 + return cuvs.InnerProduct case metric.Metric_CosineDistance: - return cuvs.DistanceL2 + return cuvs.CosineSimilarity case metric.Metric_L1Distance: - return cuvs.DistanceL2 + return cuvs.L1 default: - return cuvs.DistanceL2 + return cuvs.L2Expanded } } @@ -136,27 +100,35 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, switch vecs := any(vectors).(type) { case [][]float32: - - c := &GpuClusterer[float32]{} - c.nlist = clusterCnt - if 
len(vectors) == 0 { + if len(vecs) == 0 { return nil, moerr.NewInternalErrorNoCtx("empty dataset") } - c.vectors = vecs - c.dim = len(vecs[0]) - indexParams, err := ivf_flat.CreateIndexParams() + dim := len(vecs[0]) + // Flatten vectors for pkg/cuvs + flattened := make([]float32, len(vecs)*dim) + for i, v := range vecs { + copy(flattened[i*dim:(i+1)*dim], v) + } + + // cuVS K-Means is currently single-GPU focused in our wrapper + deviceID := 0 + nthread := uint32(1) + + km, err := cuvs.NewGpuKMeans[float32](uint32(clusterCnt), uint32(dim), resolveCuvsDistanceForDense(distanceType), maxIterations, deviceID, nthread) if err != nil { return nil, err } - indexParams.SetNLists(uint32(clusterCnt)) - indexParams.SetMetric(resolveCuvsDistanceForDense(distanceType)) - indexParams.SetKMeansNIters(uint32(maxIterations)) - indexParams.SetKMeansTrainsetFraction(1) // train all sample - c.indexParams = indexParams + + c := &GpuClusterer[float32]{ + kmeans: km, + nlist: clusterCnt, + dim: dim, + vectors: flattened, + } return c, nil + default: return elkans.NewKMeans(vectors, clusterCnt, maxIterations, deltaThreshold, distanceType, initType, spherical, nworker) - } } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go index 1132ef924c17b..72fe4108ca9c7 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/gpu_test.go @@ -17,8 +17,8 @@ package device import ( - //"fmt" "context" + //"fmt" "math/rand/v2" "sync" "testing" @@ -33,7 +33,7 @@ import ( ) func TestGpu(t *testing.T) { - + ctx := context.Background() dim := 128 dsize := 1024 nlist := 128 @@ -48,7 +48,11 @@ func TestGpu(t *testing.T) { c, err := NewKMeans[float32](vecs, nlist, 10, 0, metric.Metric_L2Distance, 0, false, 0) require.NoError(t, err) - centers, err := c.Cluster(context.Background()) + defer c.Close() + + c.InitCentroids(ctx) + + centers, err := c.Cluster(ctx) require.NoError(t, err) _, ok := 
centers.([][]float32) @@ -63,6 +67,7 @@ func TestGpu(t *testing.T) { func TestIVFAndBruteForce(t *testing.T) { + ctx := context.Background() m := mpool.MustNewZero() proc := testutil.NewProcessWithMPool(t, "", m) sqlproc := sqlexec.NewSqlProcess(proc) @@ -83,8 +88,10 @@ func TestIVFAndBruteForce(t *testing.T) { c, err := NewKMeans[float32](vecs, nlist, 10, 0, metric.Metric_L2Distance, 0, false, 0) require.NoError(t, err) + defer c.Close() - centers, err := c.Cluster(context.Background()) + c.InitCentroids(ctx) + centers, err := c.Cluster(ctx) require.NoError(t, err) centroids, ok := centers.([][]float32) @@ -97,7 +104,7 @@ func TestIVFAndBruteForce(t *testing.T) { */ queries := vecs[:8192] - idx, err := mobf.NewBruteForceIndex[float32](centroids, dimension, metric.Metric_L2sqDistance, elemsz) + idx, err := mobf.NewBruteForceIndex[float32](centroids, dimension, metric.Metric_L2sqDistance, elemsz, ncpu) require.NoError(t, err) defer idx.Destroy() @@ -116,21 +123,9 @@ func TestIVFAndBruteForce(t *testing.T) { for i := 0; i < 1000; i++ { _, _, err := idx.Search(sqlproc, queries, rt) require.NoError(t, err) - /* - - keys_i64, ok := keys.([]int64) - require.Equal(t, ok, true) - - for j, key := range keys_i64 { - require.Equal(t, key, int64(j)) - require.Equal(t, distances[j], float64(0)) - } - */ - // fmt.Printf("keys %v, dist %v\n", keys, distances) } }() } wg.Wait() - } diff --git a/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go index 17d89be59a97a..b6c614b5d6253 100644 --- a/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/device/issue_test.go @@ -17,248 +17,170 @@ package device import ( - //"fmt" + "fmt" "math/rand/v2" + "runtime" "sync" "testing" - //"os" + "github.com/matrixorigin/matrixone/pkg/cuvs" "github.com/stretchr/testify/require" - - cuvs "github.com/rapidsai/cuvs/go" - "github.com/rapidsai/cuvs/go/brute_force" - "github.com/rapidsai/cuvs/go/ivf_flat" ) -func 
getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.Distance, maxIterations int) ([][]float32, error) { - - resource, err := cuvs.NewResource(nil) - if err != nil { - return nil, err +func getCenters(vecs [][]float32, dim int, clusterCnt int, distanceType cuvs.DistanceType, maxIterations int) ([][]float32, error) { + if len(vecs) == 0 { + return nil, fmt.Errorf("empty dataset") } - defer resource.Close() - indexParams, err := ivf_flat.CreateIndexParams() - if err != nil { - return nil, err + // Flatten vectors + flattened := make([]float32, len(vecs)*dim) + for i, v := range vecs { + copy(flattened[i*dim:(i+1)*dim], v) } - defer indexParams.Close() - - indexParams.SetNLists(uint32(clusterCnt)) - indexParams.SetMetric(distanceType) - indexParams.SetKMeansNIters(uint32(maxIterations)) - indexParams.SetKMeansTrainsetFraction(1) // train all sample - dataset, err := cuvs.NewTensor(vecs) + deviceID := 0 + nthread := uint32(1) + km, err := cuvs.NewGpuKMeans[float32](uint32(clusterCnt), uint32(dim), distanceType, maxIterations, deviceID, nthread) if err != nil { return nil, err } - defer dataset.Close() - - index, _ := ivf_flat.CreateIndex(indexParams, &dataset) - defer index.Close() - - if _, err := dataset.ToDevice(&resource); err != nil { - return nil, err - } + defer km.Destroy() - centers, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(clusterCnt), int64(dim)}) + _, _, err = km.Fit(flattened, uint64(len(vecs))) if err != nil { return nil, err } - if err := ivf_flat.BuildIndex(resource, indexParams, &dataset, index); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { - return nil, err - } - - if err := ivf_flat.GetCenters(index, ¢ers); err != nil { - return nil, err - } - - if _, err := centers.ToHost(&resource); err != nil { - return nil, err - } - - if err := resource.Sync(); err != nil { + centroids, err := km.GetCentroids() + if err != nil { return nil, err } - result, err := centers.Slice() - if err 
!= nil { - return nil, err + // Reshape centroids + result := make([][]float32, clusterCnt) + for i := 0; i < clusterCnt; i++ { + result[i] = make([]float32, dim) + copy(result[i], centroids[i*dim:(i+1)*dim]) } return result, nil - } -func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.Distance) (retkeys any, retdistances []float64, err error) { - //os.Stderr.WriteString(fmt.Sprintf("probe set %d\n", len(queriesvec))) - //os.Stderr.WriteString("brute force index search start\n") - - resource, err := cuvs.NewResource(nil) - if err != nil { - return +func Search(datasetvec [][]float32, queriesvec [][]float32, limit uint, distanceType cuvs.DistanceType) (retkeys any, retdistances []float64, err error) { + if len(datasetvec) == 0 || len(queriesvec) == 0 { + return nil, nil, nil } - defer resource.Close() - dataset, err := cuvs.NewTensor(datasetvec) - if err != nil { - return + dim := len(datasetvec[0]) + flattenedDataset := make([]float32, len(datasetvec)*dim) + for i, v := range datasetvec { + copy(flattenedDataset[i*dim:(i+1)*dim], v) } - defer dataset.Close() - index, err := brute_force.CreateIndex() - if err != nil { - return + flattenedQueries := make([]float32, len(queriesvec)*dim) + for i, v := range queriesvec { + copy(flattenedQueries[i*dim:(i+1)*dim], v) } - defer index.Close() - queries, err := cuvs.NewTensor(queriesvec) + deviceID := 0 + nthread := uint32(1) + bf, err := cuvs.NewGpuBruteForce[float32](flattenedDataset, uint64(len(datasetvec)), uint32(dim), distanceType, nthread, deviceID) if err != nil { - return + return nil, nil, err } - defer queries.Close() + defer bf.Destroy() - neighbors, err := cuvs.NewTensorOnDevice[int64](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + err = bf.Load() if err != nil { - return + return nil, nil, err } - defer neighbors.Close() - distances, err := cuvs.NewTensorOnDevice[float32](&resource, []int64{int64(len(queriesvec)), int64(limit)}) + neighbors, distances, err := 
bf.Search(flattenedQueries, uint64(len(queriesvec)), uint32(dim), uint32(limit)) if err != nil { - return + return nil, nil, err } - defer distances.Close() - if _, err = dataset.ToDevice(&resource); err != nil { - return - } - - if err = resource.Sync(); err != nil { - return - } - - err = brute_force.BuildIndex(resource, &dataset, distanceType, 2.0, index) - if err != nil { - //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed %v\n", err)) - //os.Stderr.WriteString(fmt.Sprintf("BruteForceIndex: build index failed centers %v\n", datasetvec)) - return + retdistances = make([]float64, len(distances)) + for i, d := range distances { + retdistances[i] = float64(d) } - if err = resource.Sync(); err != nil { - return - } - //os.Stderr.WriteString("built brute force index\n") + retkeys = neighbors + return +} - if _, err = queries.ToDevice(&resource); err != nil { - return - } +func TestIssueGpu(t *testing.T) { + var wg sync.WaitGroup + wg.Add(1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + defer wg.Done() + + dimension := uint(128) + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } - //os.Stderr.WriteString("brute force index search Runing....\n") - err = brute_force.SearchIndex(resource, *index, &queries, &neighbors, &distances) - if err != nil { - return - } - //os.Stderr.WriteString("brute force index search finished Runing....\n") + _, err := getCenters(vecs, int(dimension), nlist, cuvs.L2Expanded, 10) + require.NoError(t, err) + }() + wg.Wait() +} - if _, err = neighbors.ToHost(&resource); err != nil { - return - } - //os.Stderr.WriteString("brute force index search neighbour to host done....\n") +func TestIssueIvfAndBruteForceForIssue(t *testing.T) { + var wg1 sync.WaitGroup + wg1.Add(1) + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + defer 
wg1.Done() + + dimension := uint(128) + limit := uint(1) + dsize := 100000 + nlist := 128 + vecs := make([][]float32, dsize) + for i := range vecs { + vecs[i] = make([]float32, dimension) + for j := range vecs[i] { + vecs[i][j] = rand.Float32() + } + } + queries := vecs[:8192] - if _, err = distances.ToHost(&resource); err != nil { - return - } - //os.Stderr.WriteString("brute force index search distances to host done....\n") + centers, err := getCenters(vecs, int(dimension), nlist, cuvs.L2Expanded, 10) + require.NoError(t, err) - if err = resource.Sync(); err != nil { - return - } + fmt.Println("centers DONE") - //os.Stderr.WriteString("brute force index search return result....\n") - neighborsSlice, err := neighbors.Slice() - if err != nil { - return - } + var wg sync.WaitGroup - distancesSlice, err := distances.Slice() - if err != nil { - return - } + for n := 0; n < 8; n++ { + wg.Add(1) + go func() { + defer wg.Done() - //fmt.Printf("flattened %v\n", flatten) - retdistances = make([]float64, len(distancesSlice)*int(limit)) - for i := range distancesSlice { - for j, dist := range distancesSlice[i] { - retdistances[i*int(limit)+j] = float64(dist) - } - } + runtime.LockOSThread() + defer runtime.UnlockOSThread() - keys := make([]int64, len(neighborsSlice)*int(limit)) - for i := range neighborsSlice { - for j, key := range neighborsSlice[i] { - keys[i*int(limit)+j] = int64(key) + for i := 0; i < 100; i++ { // Reduced iteration count for faster test run + _, _, err := Search(centers, queries, limit, cuvs.L2Expanded) + require.NoError(t, err) + } + }() } - } - retkeys = keys - //os.Stderr.WriteString("brute force index search RETURN NOW....\n") - return -} -func TestIvfAndBruteForceForIssue(t *testing.T) { - - dimension := uint(128) - limit := uint(1) - /* - ncpu := uint(1) - elemsz := uint(4) // float32 - */ - - dsize := 100000 - nlist := 128 - vecs := make([][]float32, dsize) - for i := range vecs { - vecs[i] = make([]float32, dimension) - for j := range vecs[i] { 
- vecs[i][j] = rand.Float32() - } - } - queries := vecs[:8192] - - centers, err := getCenters(vecs, int(dimension), nlist, cuvs.DistanceL2, 10) - require.NoError(t, err) - - var wg sync.WaitGroup - - for n := 0; n < 4; n++ { - - wg.Add(1) - go func() { - defer wg.Done() - for i := 0; i < 1000; i++ { - _, _, err := Search(centers, queries, limit, cuvs.DistanceL2) - require.NoError(t, err) - - /* - keys_i64, ok := keys.([]int64) - require.Equal(t, ok, true) - - for j, key := range keys_i64 { - require.Equal(t, key, int64(j)) - require.Equal(t, distances[j], float64(0)) - } - */ - // fmt.Printf("keys %v, dist %v\n", keys, distances) - } - }() - } - - wg.Wait() + wg.Wait() + }() + wg1.Wait() } diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go index bfba4529db9f4..521d8b6bef005 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer.go @@ -17,12 +17,14 @@ package elkans import ( "context" "math" - "math/rand" + "math/rand/v2" "runtime" "sync/atomic" "github.com/matrixorigin/matrixone/pkg/common/concurrent" + "github.com/matrixorigin/matrixone/pkg/common/malloc" "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/logutil" "github.com/matrixorigin/matrixone/pkg/vectorindex/ivfflat/kmeans" @@ -50,9 +52,13 @@ type ElkanClusterer[T types.RealNumbers] struct { // for each of the k centroids, we keep track of the following data centroids [][]T + nextCentroids [][]T halfInterCentroidDistMatrix [][]T minHalfInterCentroidDist []T + membersCount []int64 + centroidShiftDist []T + // thresholds maxIterations int // e in paper deltaThreshold float64 // used for early convergence. we are not using it right now. 
@@ -63,9 +69,11 @@ type ElkanClusterer[T types.RealNumbers] struct { distFn metric.DistanceFunction[T] initType kmeans.InitType - rand *rand.Rand normalize bool + // allocator tracking + deallocators []malloc.Deallocator + // number of worker threads nworker int } @@ -96,24 +104,59 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, return nil, err } - assignments := make([]int, len(vectors)) - var metas = make([]vectorMeta[T], len(vectors)) + allocator := malloc.NewCAllocator() + var deallocators []malloc.Deallocator + + allocSlice := func(size uint64) []byte { + slice, deallocator, err := allocator.Allocate(size, malloc.NoClear) + if err != nil { + panic(err) // OOM + } + deallocators = append(deallocators, deallocator) + return slice + } + + // allocate assignments + assignmentsBytes := allocSlice(uint64(len(vectors) * int(util.UnsafeSizeOf[int]()))) + assignments := util.UnsafeSliceCastToLength[int](assignmentsBytes, len(vectors)) + for i := range assignments { + assignments[i] = 0 + } + + // allocate metas + metasBytes := allocSlice(uint64(len(vectors) * int(util.UnsafeSizeOf[vectorMeta[T]]()))) + metas := util.UnsafeSliceCastToLength[vectorMeta[T]](metasBytes, len(vectors)) for i := range metas { + lowerBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[T]())) + lower := util.UnsafeSliceCastToLength[T](lowerBytes, clusterCnt) + for j := range lower { + lower[j] = 0 + } metas[i] = vectorMeta[T]{ - lower: make([]T, clusterCnt), + lower: lower, upper: 0, recompute: true, } } - centroidDist := make([][]T, clusterCnt) + // allocate centroidDist + centroidDistBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[[]T]())) + centroidDist := util.UnsafeSliceCastToLength[[]T](centroidDistBytes, clusterCnt) for i := range centroidDist { - centroidDist[i] = make([]T, clusterCnt) + distBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[T]())) + centroidDist[i] = util.UnsafeSliceCastToLength[T](distBytes, clusterCnt) 
} - minCentroidDist := make([]T, clusterCnt) + + // allocate minCentroidDist + minCentroidDistBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[T]())) + minCentroidDist := util.UnsafeSliceCastToLength[T](minCentroidDistBytes, clusterCnt) distanceFunction, normalize, err := metric.ResolveKmeansDistanceFn[T](distanceType, spherical) if err != nil { + // Before returning, we must clean up already allocated memory. + for _, d := range deallocators { + d.Deallocate() + } return nil, err } @@ -121,6 +164,30 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, nworker = runtime.NumCPU() } + // allocate centroids + centroidsBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[[]T]())) + centroids := util.UnsafeSliceCastToLength[[]T](centroidsBytes, clusterCnt) + for i := range centroids { + cBytes := allocSlice(uint64(len(vectors[0])) * uint64(util.UnsafeSizeOf[T]())) + centroids[i] = util.UnsafeSliceCastToLength[T](cBytes, len(vectors[0])) + } + + // allocate nextCentroids + nextCentroidsBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[[]T]())) + nextCentroids := util.UnsafeSliceCastToLength[[]T](nextCentroidsBytes, clusterCnt) + for i := range nextCentroids { + ncBytes := allocSlice(uint64(len(vectors[0])) * uint64(util.UnsafeSizeOf[T]())) + nextCentroids[i] = util.UnsafeSliceCastToLength[T](ncBytes, len(vectors[0])) + } + + // allocate membersCount + membersCountBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[int64]())) + membersCount := util.UnsafeSliceCastToLength[int64](membersCountBytes, clusterCnt) + + // allocate centroidShiftDist + centroidShiftDistBytes := allocSlice(uint64(clusterCnt) * uint64(util.UnsafeSizeOf[T]())) + centroidShiftDist := util.UnsafeSliceCastToLength[T](centroidShiftDistBytes, clusterCnt) + return &ElkanClusterer[T]{ maxIterations: maxIterations, deltaThreshold: deltaThreshold, @@ -129,22 +196,30 @@ func NewKMeans[T types.RealNumbers](vectors [][]T, clusterCnt, 
assignments: assignments, vectorMetas: metas, - //centroids will be initialized by InitCentroids() + centroids: centroids, + nextCentroids: nextCentroids, halfInterCentroidDistMatrix: centroidDist, minHalfInterCentroidDist: minCentroidDist, + membersCount: membersCount, + centroidShiftDist: centroidShiftDist, + distFn: distanceFunction, initType: initType, clusterCnt: clusterCnt, vectorCnt: len(vectors), - rand: rand.New(rand.NewSource(kmeans.DefaultRandSeed)), - normalize: normalize, - nworker: nworker, + normalize: normalize, + deallocators: deallocators, + nworker: nworker, }, nil } func (km *ElkanClusterer[T]) Close() error { + for _, d := range km.deallocators { + d.Deallocate() + } + km.deallocators = nil return nil } @@ -174,13 +249,21 @@ func (km *ElkanClusterer[T]) InitCentroids(ctx context.Context) error { } var ok bool - km.centroids, ok = anycentroids.([][]T) + initCentroids, ok := anycentroids.([][]T) if !ok { return moerr.NewInternalErrorNoCtx("InitCentroids not return [][]float32|float64") } // Add a dimension check for the initialized centroids - return checkCentroidDimension(km.centroids, len(km.vectorList[0])) + if err := checkCentroidDimension(initCentroids, len(km.vectorList[0])); err != nil { + return err + } + + for i := range initCentroids { + copy(km.centroids[i], initCentroids[i]) + } + + return nil } // Cluster returns the final centroids and the error if any. 
@@ -207,6 +290,8 @@ func (km *ElkanClusterer[T]) Cluster(ctx context.Context) (any, error) { func (km *ElkanClusterer[T]) elkansCluster(ctx context.Context) ([][]T, error) { + rnd := rand.New(rand.NewPCG(uint64(kmeans.DefaultRandSeed), 0)) + for iter := 0; ; iter++ { km.computeCentroidDistances(ctx) // step 1 @@ -215,11 +300,11 @@ func (km *ElkanClusterer[T]) elkansCluster(ctx context.Context) ([][]T, error) { return nil, err } - newCentroids := km.recalculateCentroids(ctx) // step 4 + newCentroids := km.recalculateCentroids(ctx, rnd, km.nextCentroids, km.membersCount) // step 4 - km.updateBounds(ctx, newCentroids) // step 5 and 6 + km.updateBounds(ctx, newCentroids, km.centroidShiftDist) // step 5 and 6 - km.centroids = newCentroids // step 7 + km.centroids, km.nextCentroids = newCentroids, km.centroids // step 7 logutil.Debugf("kmeans iter=%d, changes=%d\n", iter, changes) if iter != 0 && km.isConverged(iter, changes) { @@ -480,12 +565,14 @@ func (km *ElkanClusterer[T]) assignData(ctx context.Context) (int, error) { } // recalculateCentroids calculates the new mean centroids based on the new assignments. 
-func (km *ElkanClusterer[T]) recalculateCentroids(ctx context.Context) [][]T { - membersCount := make([]int64, km.clusterCnt) - - newCentroids := make([][]T, km.clusterCnt) +func (km *ElkanClusterer[T]) recalculateCentroids(ctx context.Context, rnd *rand.Rand, newCentroids [][]T, membersCount []int64) [][]T { + for i := range membersCount { + membersCount[i] = 0 + } for c := range newCentroids { - newCentroids[c] = make([]T, len(km.vectorList[0])) + for i := range newCentroids[c] { + newCentroids[c][i] = 0 + } } // sum of all the members of the cluster @@ -501,14 +588,12 @@ func (km *ElkanClusterer[T]) recalculateCentroids(ctx context.Context) [][]T { for c := range newCentroids { if membersCount[c] == 0 { // pick a vector randomly from existing vectors as the new centroid - //newCentroids[c] = km.vectorList[km.rand.Intn(km.vectorCnt)] + //newCentroids[c] = km.vectorList[rnd.IntN(km.vectorCnt)] //// if the cluster is empty, reinitialize it to a random vector, since you can't find the mean of an empty set - randVector := make([]T, len(km.vectorList[0])) - for l := range randVector { - randVector[l] = T(km.rand.Float32()) + for l := range newCentroids[c] { + newCentroids[c][l] = T(rnd.Float32()) } - newCentroids[c] = randVector // normalize the random vector if km.normalize { @@ -516,8 +601,13 @@ func (km *ElkanClusterer[T]) recalculateCentroids(ctx context.Context) [][]T { } } else { // find the mean of the cluster members - // note: we don't need to normalize here, since the vectors are already normalized metric.ScaleInPlace[T](newCentroids[c], 1.0/T(membersCount[c])) + + // For spherical k-means, the mean of normalized vectors must be re-normalized + // to project the centroid back onto the unit hypersphere. + if km.normalize { + metric.NormalizeL2(newCentroids[c], newCentroids[c]) + } } } @@ -526,11 +616,10 @@ func (km *ElkanClusterer[T]) recalculateCentroids(ctx context.Context) [][]T { } // updateBounds updates the lower and upper bounds for each vector. 
-func (km *ElkanClusterer[T]) updateBounds(ctx context.Context, newCentroid [][]T) (err error) { +func (km *ElkanClusterer[T]) updateBounds(ctx context.Context, newCentroid [][]T, centroidShiftDist []T) (err error) { // compute the centroid shift distance matrix once. // d(c', m(c')) in the paper - centroidShiftDist := make([]T, km.clusterCnt) for c := 0; c < km.clusterCnt; c++ { centroidShiftDist[c], err = km.distFn(km.centroids[c], newCentroid[c]) if err != nil { diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go index 465e0a4fcddc5..899cfad72106f 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_bench_test.go @@ -16,7 +16,7 @@ package elkans import ( "context" - "math/rand" + "math/rand/v2" "strconv" "testing" @@ -87,7 +87,7 @@ func Benchmark_kmeans(b *testing.B) { } func populateRandData(rowCnt int, dim int, vecs [][]float64) { - random := rand.New(rand.NewSource(kmeans.DefaultRandSeed)) + random := rand.New(rand.NewPCG(uint64(kmeans.DefaultRandSeed), 0)) for r := 0; r < rowCnt; r++ { vecs[r] = make([]float64, dim) for c := 0; c < dim; c++ { diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go index 28f2ada1ba98f..79c485a7ddd29 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/clusterer_test.go @@ -16,6 +16,8 @@ package elkans import ( "context" + "math" + "math/rand/v2" "reflect" "testing" @@ -435,12 +437,9 @@ func Test_Cluster(t *testing.T) { initType: kmeans.Random, }, want: [][]float64{ - //{0.15915269938161652, 0.31830539876323305, 0.5757527355814478, 0.7349054349630643}, // approx {1, 2, 3.6666666666666665, 4.666666666666666} - //{0.8077006350571528, 0.26637173227965466, 0.3230802540228611, 0.4038503175285764}, // approx {10, 3.333333333333333, 4, 5} {10, 
3.333333333333333, 4, 5}, {1, 2, 3.6666666666666665, 4.666666666666666}, }, - //wantSSE: 0.0657884123589134, wantSSE: 12, wantErr: false, }, @@ -740,7 +739,15 @@ func TestElkanClusterer_recalculateCentroids(t *testing.T) { // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. // Here we are only testing the working of recalculateCentroids() function. - got := ekm.recalculateCentroids(ctx) + rnd := rand.New(rand.NewPCG(uint64(kmeans.DefaultRandSeed), 0)) + + newCentroids := make([][]float64, ekm.clusterCnt) + for i := range newCentroids { + newCentroids[i] = make([]float64, len(ekm.vectorList[0])) + } + membersCount := make([]int64, ekm.clusterCnt) + + got := ekm.recalculateCentroids(ctx, rnd, newCentroids, membersCount) if !assertx.InEpsilonF64Slices(tt.want.centroids, got) { t.Errorf("centroids got = %v, want %v", got, tt.want.centroids) } @@ -880,7 +887,8 @@ func TestElkanClusterer_updateBounds(t *testing.T) { // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. // Here we are only testing the working of updateBounds() function. - ekm.updateBounds(ctx, tt.state.newCentroids) + centroidShiftDist := make([]float64, ekm.clusterCnt) + ekm.updateBounds(ctx, tt.state.newCentroids, centroidShiftDist) for i := 0; i < len(tt.want.vectorMetas); i++ { if !assertx.InEpsilonF64Slice(tt.want.vectorMetas[i].lower, ekm.vectorMetas[i].lower) { @@ -1032,7 +1040,8 @@ func TestElkanClusterer_updateBounds_Error(t *testing.T) { // NOTE: here km.Normalize() is skipped as we not calling km.Cluster() in this test. // Here we are only testing the working of updateBounds() function. 
- err := ekm.updateBounds(ctx, tt.state.newCentroids) + centroidShiftDist := make([]float64, ekm.clusterCnt) + err := ekm.updateBounds(ctx, tt.state.newCentroids, centroidShiftDist) require.NotNil(t, err) } else if !ok { t.Errorf("km not of type ElkanClusterer") @@ -1049,3 +1058,27 @@ func Test_checkCentroidDimension(t *testing.T) { err = checkCentroidDimension(c, 3) require.NoError(t, err) } + +func TestClusterer_Spherical(t *testing.T) { + ctx := context.Background() + // Vectors on unit circle + vectors := [][]float32{ + {1, 0}, {0.99, 0.1}, + {0, 1}, {0.1, 0.99}, + } + km, err := NewKMeans(vectors, 2, 10, 0.01, metric.Metric_CosineDistance, kmeans.Random, true, 1) + require.NoError(t, err) + + res, err := km.Cluster(ctx) + require.NoError(t, err) + centroids := res.([][]float32) + + // Check if centroids are normalized + for _, c := range centroids { + norm := float32(0) + for _, v := range c { + norm += v * v + } + require.InDelta(t, 1.0, math.Sqrt(float64(norm)), 1e-5) + } +} diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go index 08c3a416d69f7..19c664fba7eb4 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer.go @@ -16,7 +16,7 @@ package elkans import ( "context" - "math/rand" + "math/rand/v2" "runtime" "sync" @@ -35,22 +35,20 @@ type Initializer interface { // Random initializes the centroids with random centroids from the vector list. 
type Random struct { - rand rand.Rand } func NewRandomInitializer() Initializer { - return &Random{ - rand: *rand.New(rand.NewSource(kmeans.DefaultRandSeed)), - } + return &Random{} } func (r *Random) InitCentroids(ctx context.Context, vectors any, k int) (_centroids any, _err error) { + rnd := rand.New(rand.NewPCG(uint64(kmeans.DefaultRandSeed), 0)) switch _vecs := vectors.(type) { case [][]float32: centroids := make([][]float32, k) for i := 0; i < k; i++ { - randIdx := r.rand.Intn(len(_vecs)) + randIdx := rnd.IntN(len(_vecs)) centroids[i] = _vecs[randIdx] } return centroids, nil @@ -58,7 +56,7 @@ func (r *Random) InitCentroids(ctx context.Context, vectors any, k int) (_centro case [][]float64: centroids := make([][]float64, k) for i := 0; i < k; i++ { - randIdx := r.rand.Intn(len(_vecs)) + randIdx := rnd.IntN(len(_vecs)) centroids[i] = _vecs[randIdx] } return centroids, nil @@ -76,13 +74,11 @@ func (r *Random) InitCentroids(ctx context.Context, vectors any, k int) (_centro // Using random, we could get 3 centroids: 1&2 which are close to each other and part of cluster 1. 3 is in the middle of 2&3. // Using kmeans++, we are sure that 3 centroids are farther away from each other. type KMeansPlusPlus[T types.RealNumbers] struct { - rand rand.Rand distFn metric.DistanceFunction[T] } func NewKMeansPlusPlusInitializer[T types.RealNumbers](distFn metric.DistanceFunction[T]) Initializer { return &KMeansPlusPlus[T]{ - rand: *rand.New(rand.NewSource(kmeans.DefaultRandSeed)), distFn: distFn, } } @@ -97,8 +93,10 @@ func (kpp *KMeansPlusPlus[T]) InitCentroids(ctx context.Context, _vectors any, k numSamples := len(vectors) centroids := make([][]T, k) + rnd := rand.New(rand.NewPCG(uint64(kmeans.DefaultRandSeed), 0)) + // 1. 
start with a random center - centroids[0] = vectors[kpp.rand.Intn(numSamples)] + centroids[0] = vectors[rnd.IntN(numSamples)] distances := make([]T, numSamples) for j := range distances { @@ -124,6 +122,7 @@ func (kpp *KMeansPlusPlus[T]) InitCentroids(ctx context.Context, _vectors any, k subvec := vectors[start:end:end] subdist := distances[start:end:end] + var localDist T for i := range subvec { if i%100 == 0 && ctx.Err() != nil { @@ -139,14 +138,16 @@ func (kpp *KMeansPlusPlus[T]) InitCentroids(ctx context.Context, _vectors any, k } distance *= distance - mutex.Lock() if distance < subdist[i] { subdist[i] = distance } - totalDistToExistingCenters += subdist[i] - mutex.Unlock() + localDist += subdist[i] } + mutex.Lock() + totalDistToExistingCenters += localDist + mutex.Unlock() + return }) @@ -157,7 +158,7 @@ func (kpp *KMeansPlusPlus[T]) InitCentroids(ctx context.Context, _vectors any, k // 3. choose the next random center, using a weighted probability distribution // where it is chosen with probability proportional to D(x)^2 // Ref: https://en.wikipedia.org/wiki/K-means%2B%2B#Improved_initialization_algorithm - target := T(kpp.rand.Float32()) * totalDistToExistingCenters + target := T(rnd.Float32()) * totalDistToExistingCenters for idx, distance := range distances { target -= distance // due to floating point inaccuracies, target may be > 0 even after subtracting all distances. diff --git a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go index 51ff1c5549144..37e9737369bfb 100644 --- a/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go +++ b/pkg/vectorindex/ivfflat/kmeans/elkans/initializer_test.go @@ -53,12 +53,9 @@ func TestRandom_InitCentroids(t *testing.T) { k: 2, }, wantCentroids: [][]float64{ - // NOTE: values of random initialization need not be farther apart, it is random. - // NOTE: we get the same random values in the test case because we are using a constant seed value. 
+ {10, 3, 4, 5}, {1, 2, 4, 5}, - {1, 2, 3, 4}, - }, - }, + }}, } ctx := context.Background() @@ -108,8 +105,8 @@ func TestKMeansPlusPlus_InitCentroids(t *testing.T) { }, // Kmeans++ picked the relatively farthest points as the initial centroids wantCentroids: [][]float64{ + {10, 3, 4, 5}, {1, 2, 4, 5}, - {10, 5, 4, 5}, }, }, } diff --git a/pkg/vectorindex/ivfflat/search.go b/pkg/vectorindex/ivfflat/search.go index 4fa425042cdb1..b63ed9e0d2079 100644 --- a/pkg/vectorindex/ivfflat/search.go +++ b/pkg/vectorindex/ivfflat/search.go @@ -25,6 +25,7 @@ import ( "github.com/matrixorigin/matrixone/pkg/common/util" "github.com/matrixorigin/matrixone/pkg/container/types" "github.com/matrixorigin/matrixone/pkg/container/vector" + "github.com/matrixorigin/matrixone/pkg/logutil" "github.com/matrixorigin/matrixone/pkg/vectorindex" "github.com/matrixorigin/matrixone/pkg/vectorindex/brute_force" "github.com/matrixorigin/matrixone/pkg/vectorindex/cache" @@ -61,11 +62,10 @@ type IvfflatSearch[T types.RealNumbers] struct { } type IvfflatMeta struct { - CenterStats map[int64]int64 - Nbits uint64 - K uint32 - Seed uint64 - SmallCenterThreshold int64 + Nbits uint64 + K uint32 + Seed uint64 + DataSize int64 } // LoadStats get the number of entries per centroid @@ -75,24 +75,11 @@ func (idx *IvfflatSearchIndex[T]) LoadStats( tblcfg vectorindex.IndexTableConfig, nthread int64) error { - idx.Meta.SmallCenterThreshold = int64(0) - if sqlproc.GetResolveVariableFunc() != nil { - val, err := sqlproc.GetResolveVariableFunc()("ivf_small_centroid_threshold", true, false) - if err != nil { - return err - } - idx.Meta.SmallCenterThreshold = val.(int64) - } - - stats := make(map[int64]int64) - - sql := fmt.Sprintf("SELECT `%s`, COUNT(`%s`) FROM `%s`.`%s` WHERE `%s` = %d GROUP BY `%s`", - catalog.SystemSI_IVFFLAT_TblCol_Entries_id, - catalog.SystemSI_IVFFLAT_TblCol_Entries_pk, + logutil.Infof("IVFFLAT START: gets data size") + sql := fmt.Sprintf("SELECT COUNT(1) FROM `%s`.`%s` WHERE `%s` = %d", 
tblcfg.DbName, tblcfg.EntriesTable, catalog.SystemSI_IVFFLAT_TblCol_Entries_version, idx.Version, - catalog.SystemSI_IVFFLAT_TblCol_Entries_id, ) res, err := runSql(sqlproc, sql) @@ -101,19 +88,14 @@ func (idx *IvfflatSearchIndex[T]) LoadStats( } defer res.Close() - for _, bat := range res.Batches { - cntvec := bat.Vecs[1] - idvec := bat.Vecs[0] - - for i := 0; i < bat.RowCount(); i++ { - cid := vector.GetFixedAtNoTypeCheck[int64](idvec, i) - cnt := vector.GetFixedAtNoTypeCheck[int64](cntvec, i) - stats[cid] = cnt - } - } + // batch cannot be empty + bat := res.Batches[0] - idx.Meta.CenterStats = stats + cnt := vector.GetFixedAtNoTypeCheck[int64](bat.Vecs[0], 0) + idx.Meta.DataSize = int64(cnt) + logutil.Infof("IVFFLAT END: gets data size = %d", cnt) return nil + } // load all entries primary key per centroid and build bloomfilter per centroids @@ -130,19 +112,8 @@ func (idx *IvfflatSearchIndex[T]) LoadBloomFilters( return } - // calculate the row count for bloomfilter - if idx.Meta.CenterStats == nil { - // no stats - return - } - - maxv := int64(0) - for _, v := range idx.Meta.CenterStats { - if v > maxv { - maxv = v - } - } - + // average size per bucket to estimate the bloomfilter size + maxv := idx.Meta.DataSize / int64(idxcfg.Ivfflat.Lists) if maxv == 0 { // no entries found return @@ -182,6 +153,7 @@ func (idx *IvfflatSearchIndex[T]) LoadBloomFilters( } }() + logutil.Infof("IVFFLAT START: get bloomfilter") for i := 0; i < int(idxcfg.Ivfflat.Lists); i++ { err = func() error { bf := bloomfilters[i] @@ -210,12 +182,14 @@ func (idx *IvfflatSearchIndex[T]) LoadBloomFilters( return } } - + logutil.Infof("IVFFLAT END: get bloomfilter") return } func (idx *IvfflatSearchIndex[T]) LoadCentroids(proc *sqlexec.SqlProcess, idxcfg vectorindex.IndexConfig, tblcfg vectorindex.IndexTableConfig, nthread int64) error { + logutil.Infof("IVFFLAT START: Load Centroids") + defer logutil.Infof("IVFFLAT END: Load Centroids") // load centroids sql := fmt.Sprintf( "SELECT `%s`, `%s` 
FROM `%s`.`%s` WHERE `%s` = %d", @@ -264,7 +238,7 @@ func (idx *IvfflatSearchIndex[T]) LoadCentroids(proc *sqlexec.SqlProcess, idxcfg return moerr.NewInternalErrorNoCtx("number of centroids in db != Nlist") } - bfidx, err := brute_force.NewBruteForceIndex[T](centroids, idxcfg.Ivfflat.Dimensions, metric.MetricType(idxcfg.Ivfflat.Metric), uint(elemsz)) + bfidx, err := brute_force.NewBruteForceIndex[T](centroids, idxcfg.Ivfflat.Dimensions, metric.MetricType(idxcfg.Ivfflat.Metric), uint(elemsz), uint(nthread)) if err != nil { return err } @@ -311,40 +285,8 @@ func (idx *IvfflatSearchIndex[T]) LoadIndex(proc *sqlexec.SqlProcess, idxcfg vec return nil } -func (idx *IvfflatSearchIndex[T]) getCentroidsSum(centroids_ids []int64) uint64 { - total := uint64(0) - - if idx.Meta.CenterStats == nil { - return total - } - - for _, k := range centroids_ids { - cnt, ok := idx.Meta.CenterStats[k] - if ok { - total += uint64(cnt) - } - } - return total -} - -// merge the small centroids -func (idx *IvfflatSearchIndex[T]) findMergedCentroids(sqlproc *sqlexec.SqlProcess, centroids_ids []int64, idxcfg vectorindex.IndexConfig, probe uint) ([]int64, error) { - n := 0 - nprobe := uint(0) - - for _, k := range centroids_ids { - n++ - nprobe++ - cnt, ok := idx.Meta.CenterStats[k] - if ok && cnt < idx.Meta.SmallCenterThreshold { - nprobe-- - } - if nprobe == probe { - break - } - - } - return centroids_ids[:n], nil +func (idx *IvfflatSearchIndex[T]) getCentroidsSum(centroids_ids []int64, nlists uint) uint64 { + return uint64(idx.Meta.DataSize * int64(len(centroids_ids)) / int64(nlists)) } func (idx *IvfflatSearchIndex[T]) findCentroids(sqlproc *sqlexec.SqlProcess, query []T, distfn metric.DistanceFunction[T], idxcfg vectorindex.IndexConfig, probe uint, _ int64) ([]int64, error) { @@ -359,23 +301,12 @@ func (idx *IvfflatSearchIndex[T]) findCentroids(sqlproc *sqlexec.SqlProcess, que } rtprobe := probe - if idx.Meta.CenterStats != nil && idx.Meta.SmallCenterThreshold > 0 { - rtprobe = probe * 2 - 
if rtprobe > idxcfg.Ivfflat.Lists { - rtprobe = idxcfg.Ivfflat.Lists - } - } - queries := [][]T{query} rt := vectorindex.RuntimeConfig{Limit: rtprobe, NThreads: 1} keys, _, err := idx.Centroids.Search(sqlproc, queries, rt) if err != nil { return nil, err } - - if idx.Meta.CenterStats != nil && idx.Meta.SmallCenterThreshold > 0 { - return idx.findMergedCentroids(sqlproc, keys.([]int64), idxcfg, probe) - } return keys.([]int64), nil } @@ -477,7 +408,7 @@ func (idx *IvfflatSearchIndex[T]) getBloomFilter( if len(idx.BloomFilters) == 0 { - sum := idx.getCentroidsSum(centroids_ids) + sum := idx.getCentroidsSum(centroids_ids, idxcfg.Ivfflat.Lists) if uint64(keyvec.Length()) < sum { // unique join keys size is smaller than entries in centroids return buildBloomFilterWithUniqueJoinKeys(keyvec) diff --git a/pkg/vectorindex/ivfflat/search_test.go b/pkg/vectorindex/ivfflat/search_test.go index 88694b71323e4..8fe7e1746408f 100644 --- a/pkg/vectorindex/ivfflat/search_test.go +++ b/pkg/vectorindex/ivfflat/search_test.go @@ -86,58 +86,3 @@ func TestIvfSearchParserError(t *testing.T) { _, _, err := idx.Search(sqlproc, idxcfg, tblcfg, v, rt, 4) require.NotNil(t, err) } - -func TestFindMergedCentroids(t *testing.T) { - idx := &IvfflatSearchIndex[float32]{} - idxcfg := vectorindex.IndexConfig{} - - // Case 1: CenterStats set, SmallCenterThreshold = 0 - input := []int64{1, 2, 3, 4, 5} - probe := uint(2) - idx.Meta.CenterStats = map[int64]int64{ - 1: 100, - 2: 100, - 3: 100, - 4: 100, - 5: 100, - } - idx.Meta.SmallCenterThreshold = 0 - res, err := idx.findMergedCentroids(nil, input, idxcfg, probe) - require.Nil(t, err) - require.Equal(t, []int64{1, 2}, res) - - // Case 2: CenterStats set, with small centers - idx.Meta.SmallCenterThreshold = 50 - idx.Meta.CenterStats = map[int64]int64{ - 1: 100, // Big - 2: 10, // Small - 3: 100, // Big - 4: 10, // Small - 5: 100, // Big - } - - // probe = 2 - // 1 (Big) -> nprobe=1 - // 2 (Small) -> nprobe=1 - // 3 (Big) -> nprobe=2 -> break - res, err 
= idx.findMergedCentroids(nil, input, idxcfg, probe) - require.Nil(t, err) - require.Equal(t, []int64{1, 2, 3}, res) - - // Case 3: All small - idx.Meta.CenterStats = map[int64]int64{ - 1: 10, 2: 10, 3: 10, 4: 10, 5: 10, - } - res, err = idx.findMergedCentroids(nil, input, idxcfg, probe) - require.Nil(t, err) - require.Equal(t, input, res) - - // Case 4: probe is large - idx.Meta.CenterStats = map[int64]int64{ - 1: 100, 2: 100, 3: 100, 4: 100, 5: 100, - } - probe = 10 - res, err = idx.findMergedCentroids(nil, input, idxcfg, probe) - require.Nil(t, err) - require.Equal(t, input, res) -} diff --git a/pkg/vectorindex/metric/distance_func.go b/pkg/vectorindex/metric/distance_func.go index cf8ffae96fb22..d4a0caba77ebf 100644 --- a/pkg/vectorindex/metric/distance_func.go +++ b/pkg/vectorindex/metric/distance_func.go @@ -1,3 +1,5 @@ +//go:build !(amd64 && goexperiment.simd) + // Copyright 2023 Matrix Origin // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -121,10 +123,16 @@ func L1Distance[T types.RealNumbers](p, q []T) (T, error) { // Helper function for inline absolute value. // A good compiler might inline this automatically. abs := func(x T) T { - if x < 0 { - return -x + switch xx := any(x).(type) { + case float32: + // math.Float32bits gets the uint32 representation + // &^ (AND NOT) with 1 << 31 clears the sign bit + return T(math.Float32frombits(math.Float32bits(xx) &^ (1 << 31))) + case float64: + return T(math.Abs(xx)) + default: + return 0 } - return x } // Process the bulk of the data in chunks of 8. @@ -438,87 +446,3 @@ func ScaleInPlace[T types.RealNumbers](v []T, scale T) { v[i] *= scale } } - -// IMPORTANT: Elkans Kmeans always use L2Distance for dense vector or images. After getting the centroids, we can use other distance function -// specified by user to assign vector to corresponding centroids (CENTROIDX JOIN / ProductL2). 
- -func ResolveKmeansDistanceFn[T types.RealNumbers](metric MetricType, spherical bool) (DistanceFunction[T], bool, error) { - if spherical { - return ResolveKmeansDistanceFnForSparse[T](metric) - } - return ResolveKmeansDistanceFnForDense[T](metric) -} - -func ResolveKmeansDistanceFnForDense[T types.RealNumbers](metric MetricType) (DistanceFunction[T], bool, error) { - var distanceFunction DistanceFunction[T] - normalize := false - switch metric { - case Metric_L2Distance: - distanceFunction = L2Distance[T] - normalize = false - case Metric_L2sqDistance: - distanceFunction = L2Distance[T] - normalize = false - case Metric_InnerProduct: - distanceFunction = L2Distance[T] - normalize = false - case Metric_CosineDistance: - distanceFunction = L2Distance[T] - normalize = false - case Metric_L1Distance: - distanceFunction = L2Distance[T] - normalize = false - default: - return nil, normalize, moerr.NewInternalErrorNoCtx("invalid distance type") - } - return distanceFunction, normalize, nil -} - -// IMPORTANT: Spherical Kmeans always use Spherical Distance / Cosine Similarity for Sparse vector or text embedding (TD-IDF). -// After getting the centroids, we can use other distance function -// specified by user to assign vector to corresponding centroids (CENTROIDX JOIN / ProductL2). 
-func ResolveKmeansDistanceFnForSparse[T types.RealNumbers](metric MetricType) (DistanceFunction[T], bool, error) { - var distanceFunction DistanceFunction[T] - normalize := false - switch metric { - case Metric_L2Distance: - distanceFunction = L2Distance[T] - normalize = false - case Metric_L2sqDistance: - distanceFunction = L2Distance[T] - normalize = false - case Metric_InnerProduct: - distanceFunction = SphericalDistance[T] - normalize = true - case Metric_CosineDistance: - distanceFunction = SphericalDistance[T] - normalize = true - case Metric_L1Distance: - distanceFunction = L2Distance[T] - normalize = false - default: - return nil, normalize, moerr.NewInternalErrorNoCtx("invalid distance type") - } - return distanceFunction, normalize, nil -} - -// ResolveDistanceFn is used for similarity score for search and assign vector to centroids (CENTROIDX JOIN / ProductL2). -// IMPORTANT: Don't use it for Elkans Kmeans -func ResolveDistanceFn[T types.RealNumbers](metric MetricType) (DistanceFunction[T], error) { - var distanceFunction DistanceFunction[T] - switch metric { - case Metric_L2Distance: - distanceFunction = L2DistanceSq[T] - case Metric_L2sqDistance: - distanceFunction = L2DistanceSq[T] - case Metric_InnerProduct: - distanceFunction = InnerProduct[T] - case Metric_CosineDistance: - distanceFunction = CosineDistance[T] - case Metric_L1Distance: - distanceFunction = L1Distance[T] - default: - return nil, moerr.NewInternalErrorNoCtx("invalid distance type") - } - return distanceFunction, nil -} diff --git a/pkg/vectorindex/metric/distance_func_amd64.go b/pkg/vectorindex/metric/distance_func_amd64.go new file mode 100644 index 0000000000000..e7de3b09717d5 --- /dev/null +++ b/pkg/vectorindex/metric/distance_func_amd64.go @@ -0,0 +1,649 @@ +//go:build amd64 && go1.26 && goexperiment.simd + +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "math" + "simd/archsimd" + + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/container/types" +) + +var ( + hasAVX512 = archsimd.X86.AVX512() +) + +// Reduction Helpers - Simple Store and Tree Sum for maximum throughput +func sumF32x16(v archsimd.Float32x16) float32 { + var a [16]float32 + v.Store(&a) + s0 := (a[0] + a[1]) + (a[2] + a[3]) + s1 := (a[4] + a[5]) + (a[6] + a[7]) + s2 := (a[8] + a[9]) + (a[10] + a[11]) + s3 := (a[12] + a[13]) + (a[14] + a[15]) + return (s0 + s1) + (s2 + s3) +} + +func sumF64x8(v archsimd.Float64x8) float64 { + var a [8]float64 + v.Store(&a) + return (a[0] + a[1] + a[2] + a[3]) + (a[4] + a[5] + a[6] + a[7]) +} + +// L2 Distance Squared kernels +func L2DistanceSqFloat32(a, b []float32) (float32, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension not matched") + } + + var sum float32 + i := 0 + + if hasAVX512 && n >= 64 { + acc0, acc1, acc2, acc3 := archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{} + for i <= n-64 { + as, bs := a[i:i+64:i+64], b[i:i+64:i+64] + d0 := archsimd.LoadFloat32x16Slice(as[0:16]).Sub(archsimd.LoadFloat32x16Slice(bs[0:16])) + d1 := archsimd.LoadFloat32x16Slice(as[16:32]).Sub(archsimd.LoadFloat32x16Slice(bs[16:32])) + d2 := archsimd.LoadFloat32x16Slice(as[32:48]).Sub(archsimd.LoadFloat32x16Slice(bs[32:48])) + d3 := archsimd.LoadFloat32x16Slice(as[48:64]).Sub(archsimd.LoadFloat32x16Slice(bs[48:64])) + + acc0 = d0.MulAdd(d0, acc0) + 
acc1 = d1.MulAdd(d1, acc1) + acc2 = d2.MulAdd(d2, acc2) + acc3 = d3.MulAdd(d3, acc3) + i += 64 + } + sum += sumF32x16(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + d0 := as[0] - bs[0] + d1 := as[1] - bs[1] + d2 := as[2] - bs[2] + d3 := as[3] - bs[3] + d4 := as[4] - bs[4] + d5 := as[5] - bs[5] + d6 := as[6] - bs[6] + d7 := as[7] - bs[7] + sum += (d0*d0 + d1*d1) + (d2*d2 + d3*d3) + (d4*d4 + d5*d5) + (d6*d6 + d7*d7) + i += 8 + } + + for ; i < n; i++ { + diff := a[i] - b[i] + sum += diff * diff + } + return sum, nil +} + +func InnerProductFloat32(a, b []float32) (float32, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension not matched") + } + + var total float32 + i := 0 + + if hasAVX512 && n >= 64 { + acc0, acc1, acc2, acc3 := archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{} + for i <= n-64 { + as, bs := a[i:i+64:i+64], b[i:i+64:i+64] + acc0 = archsimd.LoadFloat32x16Slice(as[0:16]).MulAdd(archsimd.LoadFloat32x16Slice(bs[0:16]), acc0) + acc1 = archsimd.LoadFloat32x16Slice(as[16:32]).MulAdd(archsimd.LoadFloat32x16Slice(bs[16:32]), acc1) + acc2 = archsimd.LoadFloat32x16Slice(as[32:48]).MulAdd(archsimd.LoadFloat32x16Slice(bs[32:48]), acc2) + acc3 = archsimd.LoadFloat32x16Slice(as[48:64]).MulAdd(archsimd.LoadFloat32x16Slice(bs[48:64]), acc3) + i += 64 + } + total += sumF32x16(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + total += as[0]*bs[0] + as[1]*bs[1] + as[2]*bs[2] + as[3]*bs[3] + + as[4]*bs[4] + as[5]*bs[5] + as[6]*bs[6] + as[7]*bs[7] + i += 8 + } + + for ; i < n; i++ { + total += a[i] * b[i] + } + return -total, nil +} + +func L2Distance[T types.RealNumbers](v1, v2 []T) (T, error) { + if pf32, ok := any(v1).([]float32); ok { + dist, err := L2DistanceSqFloat32(pf32, any(v2).([]float32)) + if err != nil { + return 0, err + } 
+ return T(math.Sqrt(float64(dist))), nil + } + if pf64, ok := any(v1).([]float64); ok { + dist, err := L2DistanceSqFloat64(pf64, any(v2).([]float64)) + if err != nil { + return 0, err + } + return T(math.Sqrt(dist)), nil + } + return 0, moerr.NewInternalErrorNoCtx("vector type not supported") +} + +func L2DistanceSqFloat64(a, b []float64) (float64, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension not matched") + } + var sum float64 + i := 0 + if hasAVX512 && n >= 32 { + acc0, acc1, acc2, acc3 := archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{} + for i <= n-32 { + as, bs := a[i:i+32:i+32], b[i:i+32:i+32] + d0 := archsimd.LoadFloat64x8Slice(as[0:8]).Sub(archsimd.LoadFloat64x8Slice(bs[0:8])) + d1 := archsimd.LoadFloat64x8Slice(as[8:16]).Sub(archsimd.LoadFloat64x8Slice(bs[8:16])) + d2 := archsimd.LoadFloat64x8Slice(as[16:24]).Sub(archsimd.LoadFloat64x8Slice(bs[16:24])) + d3 := archsimd.LoadFloat64x8Slice(as[24:32]).Sub(archsimd.LoadFloat64x8Slice(bs[24:32])) + acc0 = d0.MulAdd(d0, acc0) + acc1 = d1.MulAdd(d1, acc1) + acc2 = d2.MulAdd(d2, acc2) + acc3 = d3.MulAdd(d3, acc3) + i += 32 + } + sum += sumF64x8(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + d0 := as[0] - bs[0] + d1 := as[1] - bs[1] + d2 := as[2] - bs[2] + d3 := as[3] - bs[3] + d4 := as[4] - bs[4] + d5 := as[5] - bs[5] + d6 := as[6] - bs[6] + d7 := as[7] - bs[7] + sum += (d0*d0 + d1*d1) + (d2*d2 + d3*d3) + (d4*d4 + d5*d5) + (d6*d6 + d7*d7) + i += 8 + } + + for ; i < n; i++ { + diff := a[i] - b[i] + sum += diff * diff + } + return sum, nil +} + +func InnerProductFloat64(a, b []float64) (float64, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension not matched") + } + var total float64 + i := 0 + if hasAVX512 && n >= 32 { + acc0, acc1, acc2, acc3 := archsimd.Float64x8{}, archsimd.Float64x8{}, 
archsimd.Float64x8{}, archsimd.Float64x8{} + for i <= n-32 { + as, bs := a[i:i+32:i+32], b[i:i+32:i+32] + acc0 = archsimd.LoadFloat64x8Slice(as[0:8]).MulAdd(archsimd.LoadFloat64x8Slice(bs[0:8]), acc0) + acc1 = archsimd.LoadFloat64x8Slice(as[8:16]).MulAdd(archsimd.LoadFloat64x8Slice(bs[8:16]), acc1) + acc2 = archsimd.LoadFloat64x8Slice(as[16:24]).MulAdd(archsimd.LoadFloat64x8Slice(bs[16:24]), acc2) + acc3 = archsimd.LoadFloat64x8Slice(as[24:32]).MulAdd(archsimd.LoadFloat64x8Slice(bs[24:32]), acc3) + i += 32 + } + total += sumF64x8(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + total += as[0]*bs[0] + as[1]*bs[1] + as[2]*bs[2] + as[3]*bs[3] + + as[4]*bs[4] + as[5]*bs[5] + as[6]*bs[6] + as[7]*bs[7] + i += 8 + } + + for ; i < n; i++ { + total += a[i] * b[i] + } + return -total, nil +} + +func L2DistanceSq[T types.RealNumbers](p, q []T) (T, error) { + if pf32, ok := any(p).([]float32); ok { + res, err := L2DistanceSqFloat32(pf32, any(q).([]float32)) + return T(res), err + } + if pf64, ok := any(p).([]float64); ok { + res, err := L2DistanceSqFloat64(pf64, any(q).([]float64)) + return T(res), err + } + return 0, moerr.NewInternalErrorNoCtx("vector type not supported") +} + +func InnerProduct[T types.RealNumbers](p, q []T) (T, error) { + if pf32, ok := any(p).([]float32); ok { + res, err := InnerProductFloat32(pf32, any(q).([]float32)) + return T(res), err + } + if pf64, ok := any(p).([]float64); ok { + res, err := InnerProductFloat64(pf64, any(q).([]float64)) + return T(res), err + } + return 0, moerr.NewInternalErrorNoCtx("vector type not supported") +} + +func L1DistanceFloat32(a, b []float32) (float32, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension mismatch") + } + var sum float32 + i := 0 + if hasAVX512 && n >= 64 { + acc0, acc1, acc2, acc3 := archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{} + for i 
<= n-64 { + as, bs := a[i:i+64:i+64], b[i:i+64:i+64] + acc0 = acc0.Add(archsimd.LoadFloat32x16Slice(as[0:16]).Sub(archsimd.LoadFloat32x16Slice(bs[0:16])).Max(archsimd.LoadFloat32x16Slice(bs[0:16]).Sub(archsimd.LoadFloat32x16Slice(as[0:16])))) + acc1 = acc1.Add(archsimd.LoadFloat32x16Slice(as[16:32]).Sub(archsimd.LoadFloat32x16Slice(bs[16:32])).Max(archsimd.LoadFloat32x16Slice(bs[16:32]).Sub(archsimd.LoadFloat32x16Slice(as[16:32])))) + acc2 = acc2.Add(archsimd.LoadFloat32x16Slice(as[32:48]).Sub(archsimd.LoadFloat32x16Slice(bs[32:48])).Max(archsimd.LoadFloat32x16Slice(bs[32:48]).Sub(archsimd.LoadFloat32x16Slice(as[32:48])))) + acc3 = acc3.Add(archsimd.LoadFloat32x16Slice(as[48:64]).Sub(archsimd.LoadFloat32x16Slice(bs[48:64])).Max(archsimd.LoadFloat32x16Slice(bs[48:64]).Sub(archsimd.LoadFloat32x16Slice(as[48:64])))) + i += 64 + } + sum += sumF32x16(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + abs := func(x float32) float32 { + return math.Float32frombits(math.Float32bits(x) &^ (1 << 31)) + } + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + sum += abs(as[0]-bs[0]) + abs(as[1]-bs[1]) + abs(as[2]-bs[2]) + abs(as[3]-bs[3]) + + abs(as[4]-bs[4]) + abs(as[5]-bs[5]) + abs(as[6]-bs[6]) + abs(as[7]-bs[7]) + i += 8 + } + + for ; i < n; i++ { + sum += abs(a[i] - b[i]) + } + return sum, nil +} + +func L1DistanceFloat64(a, b []float64) (float64, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension mismatch") + } + var sum float64 + i := 0 + if hasAVX512 && n >= 32 { + acc0, acc1, acc2, acc3 := archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{} + for i <= n-32 { + as, bs := a[i:i+32:i+32], b[i:i+32:i+32] + acc0 = acc0.Add(archsimd.LoadFloat64x8Slice(as[0:8]).Sub(archsimd.LoadFloat64x8Slice(bs[0:8])).Max(archsimd.LoadFloat64x8Slice(bs[0:8]).Sub(archsimd.LoadFloat64x8Slice(as[0:8])))) + acc1 = 
acc1.Add(archsimd.LoadFloat64x8Slice(as[8:16]).Sub(archsimd.LoadFloat64x8Slice(bs[8:16])).Max(archsimd.LoadFloat64x8Slice(bs[8:16]).Sub(archsimd.LoadFloat64x8Slice(as[8:16])))) + acc2 = acc2.Add(archsimd.LoadFloat64x8Slice(as[16:24]).Sub(archsimd.LoadFloat64x8Slice(bs[16:24])).Max(archsimd.LoadFloat64x8Slice(bs[16:24]).Sub(archsimd.LoadFloat64x8Slice(as[16:24])))) + acc3 = acc3.Add(archsimd.LoadFloat64x8Slice(as[24:32]).Sub(archsimd.LoadFloat64x8Slice(bs[24:32])).Max(archsimd.LoadFloat64x8Slice(bs[24:32]).Sub(archsimd.LoadFloat64x8Slice(as[24:32])))) + i += 32 + } + sum += sumF64x8(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + abs := func(x float64) float64 { + return math.Abs(x) + } + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + sum += abs(as[0]-bs[0]) + abs(as[1]-bs[1]) + abs(as[2]-bs[2]) + abs(as[3]-bs[3]) + + abs(as[4]-bs[4]) + abs(as[5]-bs[5]) + abs(as[6]-bs[6]) + abs(as[7]-bs[7]) + i += 8 + } + + for ; i < n; i++ { + sum += abs(a[i] - b[i]) + } + return sum, nil +} + +func L1Distance[T types.RealNumbers](p, q []T) (T, error) { + if pf32, ok := any(p).([]float32); ok { + res, err := L1DistanceFloat32(pf32, any(q).([]float32)) + return T(res), err + } + if pf64, ok := any(p).([]float64); ok { + res, err := L1DistanceFloat64(pf64, any(q).([]float64)) + return T(res), err + } + return 0, moerr.NewInternalErrorNoCtx("vector type not supported") +} + +func CosineDistanceF32(a, b []float32) (float32, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension mismatch") + } + var dot, normA, normB float32 + i := 0 + if n >= 16 && hasAVX512 { + accD, accA, accB := archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{} + for i <= n-16 { + va, vb := archsimd.LoadFloat32x16Slice(a[i:i+16]), archsimd.LoadFloat32x16Slice(b[i:i+16]) + accD = va.MulAdd(vb, accD) + accA = va.MulAdd(va, accA) + accB = vb.MulAdd(vb, accB) + i += 16 + } + dot, normA, normB = sumF32x16(accD), sumF32x16(accA), 
sumF32x16(accB) + } + + for i <= n-4 { + // BCE Hint + va := a[i : i+4 : i+4] + vb := b[i : i+4 : i+4] + dot += va[0]*vb[0] + va[1]*vb[1] + va[2]*vb[2] + va[3]*vb[3] + normA += va[0]*va[0] + va[1]*va[1] + va[2]*va[2] + va[3]*va[3] + normB += vb[0]*vb[0] + vb[1]*vb[1] + vb[2]*vb[2] + vb[3]*vb[3] + i += 4 + } + + for ; i < n; i++ { + dot, normA, normB = dot+a[i]*b[i], normA+a[i]*a[i], normB+b[i]*b[i] + } + den := math.Sqrt(float64(normA)) * math.Sqrt(float64(normB)) + if den == 0 { + return 1.0, nil + } + return float32(1.0 - float64(dot)/den), nil +} + +func CosineDistanceF64(a, b []float64) (float64, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension mismatch") + } + var dot, normA, normB float64 + i := 0 + if n >= 8 && hasAVX512 { + accD, accA, accB := archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{} + for i <= n-8 { + va, vb := archsimd.LoadFloat64x8Slice(a[i:i+8]), archsimd.LoadFloat64x8Slice(b[i:i+8]) + accD = va.MulAdd(vb, accD) + accA = va.MulAdd(va, accA) + accB = vb.MulAdd(vb, accB) + i += 8 + } + dot, normA, normB = sumF64x8(accD), sumF64x8(accA), sumF64x8(accB) + } + + for i <= n-4 { + // BCE Hint + va := a[i : i+4 : i+4] + vb := b[i : i+4 : i+4] + dot += va[0]*vb[0] + va[1]*vb[1] + va[2]*vb[2] + va[3]*vb[3] + normA += va[0]*va[0] + va[1]*va[1] + va[2]*va[2] + va[3]*va[3] + normB += vb[0]*vb[0] + vb[1]*vb[1] + vb[2]*vb[2] + vb[3]*vb[3] + i += 4 + } + + for ; i < n; i++ { + dot, normA, normB = dot+a[i]*b[i], normA+a[i]*a[i], normB+b[i]*b[i] + } + den := math.Sqrt(normA) * math.Sqrt(normB) + if den == 0 { + return 1.0, nil + } + return 1.0 - dot/den, nil +} + +func CosineDistance[T types.RealNumbers](p, q []T) (T, error) { + if pf32, ok := any(p).([]float32); ok { + res, err := CosineDistanceF32(pf32, any(q).([]float32)) + return T(res), err + } + if pf64, ok := any(p).([]float64); ok { + res, err := CosineDistanceF64(pf64, any(q).([]float64)) + return T(res), err + } + return 0, 
moerr.NewInternalErrorNoCtx("vector type not supported") +} + +func CosineSimilarityF32(a, b []float32) (float32, error) { + n := len(a) + if n == 0 { + return 0, nil + } + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension mismatch") + } + var dot, normA, normB float32 + i := 0 + if n >= 16 && hasAVX512 { + accD, accA, accB := archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{} + for i <= n-16 { + va, vb := archsimd.LoadFloat32x16Slice(a[i:i+16]), archsimd.LoadFloat32x16Slice(b[i:i+16]) + accD = va.MulAdd(vb, accD) + accA = va.MulAdd(va, accA) + accB = vb.MulAdd(vb, accB) + i += 16 + } + dot, normA, normB = sumF32x16(accD), sumF32x16(accA), sumF32x16(accB) + } + + for i <= n-4 { + // BCE Hint + va := a[i : i+4 : i+4] + vb := b[i : i+4 : i+4] + dot += va[0]*vb[0] + va[1]*vb[1] + va[2]*vb[2] + va[3]*vb[3] + normA += va[0]*va[0] + va[1]*va[1] + va[2]*va[2] + va[3]*va[3] + normB += vb[0]*vb[0] + vb[1]*vb[1] + vb[2]*vb[2] + vb[3]*vb[3] + i += 4 + } + + for ; i < n; i++ { + dot, normA, normB = dot+a[i]*b[i], normA+a[i]*a[i], normB+b[i]*b[i] + } + den := math.Sqrt(float64(normA)) * math.Sqrt(float64(normB)) + if den == 0 { + return 0, moerr.NewInternalErrorNoCtx("cosine similarity zero denominator") + } + return float32(float64(dot) / den), nil +} + +func CosineSimilarityF64(a, b []float64) (float64, error) { + n := len(a) + if n == 0 { + return 0, nil + } + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension mismatch") + } + var dot, normA, normB float64 + i := 0 + if n >= 8 && hasAVX512 { + accD, accA, accB := archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{} + for i <= n-8 { + va, vb := archsimd.LoadFloat64x8Slice(a[i:i+8]), archsimd.LoadFloat64x8Slice(b[i:i+8]) + accD = va.MulAdd(vb, accD) + accA = va.MulAdd(va, accA) + accB = vb.MulAdd(vb, accB) + i += 8 + } + dot, normA, normB = sumF64x8(accD), sumF64x8(accA), sumF64x8(accB) + } + + for i <= n-4 { + // BCE Hint + va := a[i : i+4 : i+4] + 
vb := b[i : i+4 : i+4] + dot += va[0]*vb[0] + va[1]*vb[1] + va[2]*vb[2] + va[3]*vb[3] + normA += va[0]*va[0] + va[1]*va[1] + va[2]*va[2] + va[3]*va[3] + normB += vb[0]*vb[0] + vb[1]*vb[1] + vb[2]*vb[2] + vb[3]*vb[3] + i += 4 + } + + for ; i < n; i++ { + dot, normA, normB = dot+a[i]*b[i], normA+a[i]*a[i], normB+b[i]*b[i] + } + den := math.Sqrt(normA) * math.Sqrt(normB) + if den == 0 { + return 0, moerr.NewInternalErrorNoCtx("cosine similarity zero denominator") + } + return dot / den, nil +} + +func CosineSimilarity[T types.RealNumbers](p, q []T) (T, error) { + if pf32, ok := any(p).([]float32); ok { + res, err := CosineSimilarityF32(pf32, any(q).([]float32)) + return T(res), err + } + if pf64, ok := any(p).([]float64); ok { + res, err := CosineSimilarityF64(pf64, any(q).([]float64)) + return T(res), err + } + return 0, moerr.NewInternalErrorNoCtx("vector type not supported") +} + +func SphericalDistanceFloat32(a, b []float32) (float32, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension not matched") + } + var total float32 + i := 0 + if hasAVX512 && n >= 64 { + acc0, acc1, acc2, acc3 := archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{}, archsimd.Float32x16{} + for i <= n-64 { + as, bs := a[i:i+64:i+64], b[i:i+64:i+64] + acc0 = archsimd.LoadFloat32x16Slice(as[0:16]).MulAdd(archsimd.LoadFloat32x16Slice(bs[0:16]), acc0) + acc1 = archsimd.LoadFloat32x16Slice(as[16:32]).MulAdd(archsimd.LoadFloat32x16Slice(bs[16:32]), acc1) + acc2 = archsimd.LoadFloat32x16Slice(as[32:48]).MulAdd(archsimd.LoadFloat32x16Slice(bs[32:48]), acc2) + acc3 = archsimd.LoadFloat32x16Slice(as[48:64]).MulAdd(archsimd.LoadFloat32x16Slice(bs[48:64]), acc3) + i += 64 + } + total += sumF32x16(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + total += as[0]*bs[0] + as[1]*bs[1] + as[2]*bs[2] + as[3]*bs[3] + + as[4]*bs[4] + as[5]*bs[5] + as[6]*bs[6] + as[7]*bs[7] + i 
+= 8 + } + + for ; i < n; i++ { + total += a[i] * b[i] + } + if total > 1.0 { + total = 1.0 + } else if total < -1.0 { + total = -1.0 + } + return float32(math.Acos(float64(total)) / math.Pi), nil +} + +func SphericalDistanceFloat64(a, b []float64) (float64, error) { + n := len(a) + if n != len(b) { + return 0, moerr.NewInternalErrorNoCtx("vector dimension not matched") + } + var total float64 + i := 0 + if hasAVX512 && n >= 32 { + acc0, acc1, acc2, acc3 := archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{}, archsimd.Float64x8{} + for i <= n-32 { + as, bs := a[i:i+32:i+32], b[i:i+32:i+32] + acc0 = archsimd.LoadFloat64x8Slice(as[0:8]).MulAdd(archsimd.LoadFloat64x8Slice(bs[0:8]), acc0) + acc1 = archsimd.LoadFloat64x8Slice(as[8:16]).MulAdd(archsimd.LoadFloat64x8Slice(bs[8:16]), acc1) + acc2 = archsimd.LoadFloat64x8Slice(as[16:24]).MulAdd(archsimd.LoadFloat64x8Slice(bs[16:24]), acc2) + acc3 = archsimd.LoadFloat64x8Slice(as[24:32]).MulAdd(archsimd.LoadFloat64x8Slice(bs[24:32]), acc3) + i += 32 + } + total += sumF64x8(acc0.Add(acc1).Add(acc2.Add(acc3))) + } + + for i <= n-8 { + // BCE Hint + as := a[i : i+8 : i+8] + bs := b[i : i+8 : i+8] + total += as[0]*bs[0] + as[1]*bs[1] + as[2]*bs[2] + as[3]*bs[3] + + as[4]*bs[4] + as[5]*bs[5] + as[6]*bs[6] + as[7]*bs[7] + i += 8 + } + + for ; i < n; i++ { + total += a[i] * b[i] + } + if total > 1.0 { + total = 1.0 + } else if total < -1.0 { + total = -1.0 + } + return math.Acos(total) / math.Pi, nil +} + +func SphericalDistance[T types.RealNumbers](p, q []T) (T, error) { + if pf32, ok := any(p).([]float32); ok { + res, err := SphericalDistanceFloat32(pf32, any(q).([]float32)) + return T(res), err + } + if pf64, ok := any(p).([]float64); ok { + res, err := SphericalDistanceFloat64(pf64, any(q).([]float64)) + return T(res), err + } + return 0, moerr.NewInternalErrorNoCtx("vector type not supported") +} + +func NormalizeL2[T types.RealNumbers](v1 []T, normalized []T) error { + if len(v1) == 0 { + return 
moerr.NewInternalErrorNoCtx("cannot normalize empty vector") + } + var sumSquares float64 + for _, val := range v1 { + sumSquares += float64(val) * float64(val) + } + norm := math.Sqrt(sumSquares) + if norm == 0 { + copy(normalized, v1) + return nil + } + for i, val := range v1 { + normalized[i] = T(float64(val) / norm) + } + return nil +} + +func ScaleInPlace[T types.RealNumbers](v []T, scale T) { + for i := range v { + v[i] *= scale + } +} diff --git a/pkg/vectorindex/metric/distance_func_bench_test.go b/pkg/vectorindex/metric/distance_func_bench_test.go index 506d602d116cd..9a81b4acb6a28 100644 --- a/pkg/vectorindex/metric/distance_func_bench_test.go +++ b/pkg/vectorindex/metric/distance_func_bench_test.go @@ -25,10 +25,10 @@ Benchmark_L2Distance/Normalize_L2-10 1277733 1 Benchmark_L2Distance/L2_Distance(v1,_NormalizeL2)-10 589376 1883 ns/op */ func Benchmark_L2Distance(b *testing.B) { - dim := 128 + dim := 1024 - b.Run("L2 Distance", func(b *testing.B) { - v1, v2 := randomVectors(b.N, dim), randomVectors(b.N, dim) + b.Run("L2 Distance float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) b.ResetTimer() for i := 0; i < b.N; i++ { @@ -36,34 +36,211 @@ func Benchmark_L2Distance(b *testing.B) { } }) - b.Run("Normalize L2", func(b *testing.B) { - v1 := randomVectors(b.N, dim) + b.Run("L2 Distance float32", func(b *testing.B) { + v1, v2 := randomVectors[float32](b.N, dim), randomVectors[float32](b.N, dim) b.ResetTimer() for i := 0; i < b.N; i++ { - res := make([]float64, dim) - _ = NormalizeL2[float64](v1[i], res) + _, _ = L2Distance[float32](v1[i], v2[i]) } }) - b.Run("L2 Distance(v1, NormalizeL2)", func(b *testing.B) { - v1, v2 := randomVectors(b.N, dim), randomVectors(b.N, dim) + /* + b.Run("Normalize L2 float64", func(b *testing.B) { + v1 := randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + res := make([]float64, dim) + _ = NormalizeL2[float64](v1[i], res) + } + }) + + 
b.Run("Normalize L2 float32", func(b *testing.B) { + v1 := randomVectors[float32](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + res := make([]float32, dim) + _ = NormalizeL2[float32](v1[i], res) + } + }) + + b.Run("L2 Distance(v1, NormalizeL2) float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + res := make([]float64, dim) + _ = NormalizeL2[float64](v2[i], res) + _, _ = L2Distance[float64](v1[i], res) + } + }) + */ +} + +func Benchmark_L2DistanceSq(b *testing.B) { + dim := 1024 + + b.Run("L2 DistanceSq float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = L2DistanceSq[float64](v1[i], v2[i]) + } + }) + + b.Run("L2 DistanceSq float32", func(b *testing.B) { + v1, v2 := randomVectors[float32](b.N, dim), randomVectors[float32](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = L2DistanceSq[float32](v1[i], v2[i]) + } + }) +} + +func Benchmark_L1Distance(b *testing.B) { + dim := 1024 + + b.Run("L1 Distance float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = L1Distance[float64](v1[i], v2[i]) + } + }) + + b.Run("L1 Distance float32", func(b *testing.B) { + v1, v2 := randomVectors[float32](b.N, dim), randomVectors[float32](b.N, dim) b.ResetTimer() for i := 0; i < b.N; i++ { - res := make([]float64, dim) - _ = NormalizeL2[float64](v2[i], res) - _, _ = L2Distance[float64](v1[i], res) + _, _ = L1Distance[float32](v1[i], v2[i]) } }) +} + +func Benchmark_InnerProduct(b *testing.B) { + dim := 1024 + + b.Run("Inner Product float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = InnerProduct[float64](v1[i], v2[i]) + 
} + }) + + b.Run("Inner Product float32", func(b *testing.B) { + v1, v2 := randomVectors[float32](b.N, dim), randomVectors[float32](b.N, dim) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = InnerProduct[float32](v1[i], v2[i]) + } + }) } -func randomVectors(size, dim int) [][]float64 { - vectors := make([][]float64, size) +func Benchmark_CosineDistance(b *testing.B) { + dim := 1024 + + b.Run("Cosine Distance float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = CosineDistance[float64](v1[i], v2[i]) + } + }) + + b.Run("Cosine Distance float32", func(b *testing.B) { + v1, v2 := randomVectors[float32](b.N, dim), randomVectors[float32](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = CosineDistance[float32](v1[i], v2[i]) + } + }) +} + +func Benchmark_CosineSimilarity(b *testing.B) { + dim := 1024 + + b.Run("Cosine Similarity float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = CosineSimilarity[float64](v1[i], v2[i]) + } + }) + + b.Run("Cosine Similarity float32", func(b *testing.B) { + v1, v2 := randomVectors[float32](b.N, dim), randomVectors[float32](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = CosineSimilarity[float32](v1[i], v2[i]) + } + }) +} + +func Benchmark_SphericalDistance(b *testing.B) { + dim := 1024 + + b.Run("Spherical Distance float64", func(b *testing.B) { + v1, v2 := randomVectors[float64](b.N, dim), randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = SphericalDistance[float64](v1[i], v2[i]) + } + }) + + b.Run("Spherical Distance float32", func(b *testing.B) { + v1, v2 := randomVectors[float32](b.N, dim), randomVectors[float32](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + _, _ = SphericalDistance[float32](v1[i], v2[i]) + } + }) +} + 
+/* +func Benchmark_ScaleInPlace(b *testing.B) { + dim := 1024 + + b.Run("ScaleInPlace float64", func(b *testing.B) { + v1 := randomVectors[float64](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + ScaleInPlace[float64](v1[i], 0.5) + } + }) + + b.Run("ScaleInPlace float32", func(b *testing.B) { + v1 := randomVectors[float32](b.N, dim) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + ScaleInPlace[float32](v1[i], 0.5) + } + }) +} +*/ + +func randomVectors[T float32 | float64](size, dim int) [][]T { + vectors := make([][]T, size) for i := range vectors { + vectors[i] = make([]T, dim) for j := 0; j < dim; j++ { - vectors[i] = append(vectors[i], rand.Float64()) + vectors[i][j] = T(rand.Float64()) } } return vectors diff --git a/pkg/vectorindex/metric/distance_func_f32_test.go b/pkg/vectorindex/metric/distance_func_f32_test.go new file mode 100644 index 0000000000000..098ab6134add8 --- /dev/null +++ b/pkg/vectorindex/metric/distance_func_f32_test.go @@ -0,0 +1,583 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metric + +import ( + "testing" + + "github.com/matrixorigin/matrixone/pkg/common/assertx" +) + +func Test_L2Distance_F32(t *testing.T) { + type args struct { + v1 []float32 + v2 []float32 + } + tests := []struct { + name string + args args + want float32 + }{ + { + name: "Test 1", + args: args{ + v1: []float32{1, 2, 3, 4}, + v2: []float32{1, 2, 4, 5}, + }, + want: 1.4142135623730951, + }, + { + name: "Test 2", + args: args{ + v1: []float32{10, 20, 30, 40}, + v2: []float32{10.5, 21.5, 31.5, 43.5}, + }, + want: 4.123105625617661, + }, + { + name: "Test 3.a", + args: args{ + v1: []float32{1, 1}, + v2: []float32{4, 1}, + }, + want: 3, + }, + { + name: "Test 3.b", + args: args{ + v1: []float32{4, 1}, + v2: []float32{1, 4}, + }, + want: 4.242640687119285, + }, + { + name: "Test 3.c", + args: args{ + v1: []float32{1, 4}, + v2: []float32{1, 1}, + }, + want: 3, + }, + { + name: "Test 4", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: 3.1622776601683795, + }, + { + name: "Test 5", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 5.196152422706632, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := L2Distance[float32](tt.args.v1, tt.args.v2); err != nil || got != tt.want { + t.Errorf("L2Distance() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_L1Distance_F32(t *testing.T) { + type args struct { + v1 []float32 + v2 []float32 + } + tests := []struct { + name string + args args + want float32 + }{ + { + name: "Test 1", + args: args{ + v1: []float32{1, 2, 3, 4}, + v2: []float32{1, 2, 4, 5}, + }, + want: 2, + }, + { + name: "Test 2", + args: args{ + v1: []float32{10, 20, 30, 40}, + v2: []float32{10.5, 21.5, 31.5, 43.5}, + }, + want: 7, + }, + { + name: "Test 3.a", 
+ args: args{ + v1: []float32{1, 1}, + v2: []float32{4, 1}, + }, + want: 3, + }, + { + name: "Test 3.b", + args: args{ + v1: []float32{4, 1}, + v2: []float32{1, 4}, + }, + want: 6, + }, + { + name: "Test 3.c", + args: args{ + v1: []float32{1, 4}, + v2: []float32{1, 1}, + }, + want: 3, + }, + { + name: "Test 4", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: 10, + }, + { + name: "Test 5", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 27, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := L1Distance[float32](tt.args.v1, tt.args.v2); err != nil || got != tt.want { + t.Errorf("L1Distance() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_CosineDistance_F32(t *testing.T) { + type args struct { + v1 []float32 + v2 []float32 + } + tests := []struct { + name string + args args + want float32 + }{ + { + name: "Test 1", + args: args{ + v1: []float32{1, 2, 3, 4}, + v2: []float32{1, 2, 4, 5}, + }, + want: 0.003993481192393733, + }, + { + name: "Test 2", + args: args{ + v1: []float32{10, 20, 30, 40}, + v2: []float32{10.5, 21.5, 31.5, 43.5}, + }, + want: 0.0001253573895874105, + }, + { + name: "Test 3.a", + args: args{ + v1: []float32{1, 1}, + v2: []float32{4, 1}, + }, + want: 0.1425070742874559, + }, + { + name: "Test 3.b", + args: args{ + v1: []float32{4, 1}, + v2: []float32{1, 4}, + }, + want: 0.5294117647058824, + }, + { + name: "Test 3.c", + args: args{ + v1: []float32{1, 4}, + v2: []float32{1, 1}, + }, + want: 0.1425070742874559, + }, + { + name: "Test 4", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: 0.0021238962030426523, + }, + { + name: "Test 5", + args: args{ + v1: []float32{1, 2, 3, 
4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 0.0025062434610066964, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := CosineDistance[float32](tt.args.v1, tt.args.v2); err != nil || got != tt.want { + t.Errorf("CosineDistance() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_CosineSimilarity_F32(t *testing.T) { + type args struct { + v1 []float32 + v2 []float32 + } + tests := []struct { + name string + args args + want float32 + }{ + { + name: "Test 1", + args: args{ + v1: []float32{1, 2, 3, 4}, + v2: []float32{1, 2, 4, 5}, + }, + want: 0.9960065188076063, + }, + { + name: "Test 2", + args: args{ + v1: []float32{10, 20, 30, 40}, + v2: []float32{10.5, 21.5, 31.5, 43.5}, + }, + want: 0.9998746426104126, + }, + { + name: "Test 3.a", + args: args{ + v1: []float32{1, 1}, + v2: []float32{4, 1}, + }, + want: 0.8574929257125441, + }, + { + name: "Test 3.b", + args: args{ + v1: []float32{4, 1}, + v2: []float32{1, 4}, + }, + want: 0.47058823529411764, + }, + { + name: "Test 3.c", + args: args{ + v1: []float32{1, 4}, + v2: []float32{1, 1}, + }, + want: 0.8574929257125441, + }, + { + name: "Test 4", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: 0.9978761037969573, + }, + { + name: "Test 5", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 0.9974937565389933, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := CosineSimilarity[float32](tt.args.v1, tt.args.v2); err != nil || got != tt.want { + t.Errorf("CosineSimilarity() = %v, want %v", got, tt.want) + } + }) + } +} + +func 
Test_InnerProduct_F32(t *testing.T) { + type args struct { + v1 []float32 + v2 []float32 + } + tests := []struct { + name string + args args + want float32 + }{ + { + name: "Test 1", + args: args{ + v1: []float32{1, 2, 3, 4}, + v2: []float32{1, 2, 4, 5}, + }, + want: -37, + }, + { + name: "Test 2", + args: args{ + v1: []float32{10, 20, 30, 40}, + v2: []float32{10.5, 21.5, 31.5, 43.5}, + }, + want: -3220, + }, + { + name: "Test 3.a", + args: args{ + v1: []float32{1, 1}, + v2: []float32{4, 1}, + }, + want: -5, + }, + { + name: "Test 3.b", + args: args{ + v1: []float32{4, 1}, + v2: []float32{1, 4}, + }, + want: -8, + }, + { + name: "Test 3.c", + args: args{ + v1: []float32{1, 4}, + v2: []float32{1, 1}, + }, + want: -5, + }, + { + name: "Test 4", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: -440, + }, + { + name: "Test 5", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: -1048, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := InnerProduct[float32](tt.args.v1, tt.args.v2); err != nil || got != tt.want { + t.Errorf("InnerProduct() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_L2DistanceSq_F32(t *testing.T) { + type args struct { + v1 []float32 + v2 []float32 + } + tests := []struct { + name string + args args + want float32 + }{ + { + name: "Test 1", + args: args{ + v1: []float32{1, 2, 3, 4}, + v2: []float32{1, 2, 4, 5}, + }, + want: 2, + }, + { + name: "Test 2", + args: args{ + v1: []float32{10, 20, 30, 40}, + v2: []float32{10.5, 21.5, 31.5, 43.5}, + }, + want: 17, + }, + { + name: "Test 3.a", + args: args{ + v1: []float32{1, 1}, + v2: []float32{4, 1}, + }, + want: 9, + }, + { + name: "Test 3.b", + args: args{ + v1: []float32{4, 1}, + v2: []float32{1, 4}, + }, 
+ want: 18, + }, + { + name: "Test 3.c", + args: args{ + v1: []float32{1, 4}, + v2: []float32{1, 1}, + }, + want: 9, + }, + { + name: "Test 4", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: 10, + }, + { + name: "Test 5", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 27, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := L2DistanceSq[float32](tt.args.v1, tt.args.v2); err != nil || got != tt.want { + t.Errorf("L2DistanceSq() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_AngularDistance_F32(t *testing.T) { + type args struct { + v1 []float32 + v2 []float32 + } + tests := []struct { + name string + args args + want float32 + }{ + { + name: "Test 1", + args: args{ + v1: []float32{1, 2, 3, 4}, + v2: []float32{1, 2, 4, 5}, + }, + want: 0, + }, + { + name: "Test 2", + args: args{ + v1: []float32{10, 20, 30, 40}, + v2: []float32{10.5, 21.5, 31.5, 43.5}, + }, + want: 0, + }, + // Test 3: Triangle Inequality check on **un-normalized** vector + // A(1,0),B(2,2), C(0,1) => AB + AC !>= BC => 0 + 0 !>= 0.5 + { + name: "Test 3.a", + args: args{ + v1: []float32{1, 0}, + v2: []float32{2, 2}, + }, + want: 0, + }, + { + name: "Test 3.b", + args: args{ + v1: []float32{2, 2}, + v2: []float32{0, 1}, + }, + want: 0, + }, + { + name: "Test 3.c", + args: args{ + v1: []float32{0, 1}, + v2: []float32{1, 0}, + }, + want: 0.5, + }, + { + name: "Test 4", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: 0, + }, + { + name: "Test 5", + args: args{ + v1: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float32{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 0, + }, + + // Test 4: Triangle Inequality check on **normalized** vector + // A(1,0),B(2,2), C(0,1) => AB + AC >= BC => 0.25 + 0.25 >= 0.5 + //{ + // name: "Test 4.a", + // args: args{ + // v1: moarray.NormalizeMoVecf64([]float32{1, 0}), + // v2: moarray.NormalizeMoVecf64([]float32{2, 2}), + // }, + // want: 0.25000000000000006, + //}, + //{ + // name: "Test 4.b", + // args: args{ + // v1: moarray.NormalizeMoVecf64([]float32{2, 2}), + // v2: moarray.NormalizeMoVecf64([]float32{0, 1}), + // }, + // want: 0.25000000000000006, + //}, + //{ + // name: "Test 4.c", + // args: args{ + // v1: moarray.NormalizeMoVecf64([]float32{0, 1}), + // v2: moarray.NormalizeMoVecf64([]float32{1, 0}), + // }, + // want: 0.5, + //}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + if got, err := SphericalDistance[float32](tt.args.v1, tt.args.v2); err != nil || !assertx.InEpsilonF64(float64(got), float64(tt.want)) { + t.Errorf("SphericalDistance() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/vectorindex/metric/distance_func_test.go b/pkg/vectorindex/metric/distance_func_test.go index 057e47a2c4e30..4dcaa99f100aa 100644 --- a/pkg/vectorindex/metric/distance_func_test.go +++ b/pkg/vectorindex/metric/distance_func_test.go @@ -15,7 +15,6 @@ package metric import ( - "fmt" "math" "testing" @@ -47,10 +46,8 @@ func Test_Blas32(t *testing.T) { distfn, _, err := ResolveKmeansDistanceFn[float32](Metric_L2Distance, false) require.Nil(t, err) - v, err := distfn(v1.Data, v2.Data) + _, err = distfn(v1.Data, v2.Data) require.Nil(t, err) - - fmt.Printf("blas32 v = %v\n", v) } func Test_ResolveFun(t *testing.T) { @@ -213,6 +210,22 @@ func Test_L2Distance(t *testing.T) { }, want: 3.1622776601683795, }, + { + name: "Test 5", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 
3, 4, 5, 6, 7, 8}, + }, + want: 5.196152422706632, + }, + { + name: "Test 6", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2}, + }, + want: 4.58257569495584, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -281,6 +294,22 @@ func Test_L1Distance(t *testing.T) { }, want: 10, }, + { + name: "Test 5", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 27, + }, + { + name: "Test 6", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2}, + }, + want: 21, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -349,6 +378,22 @@ func Test_CosineDistance(t *testing.T) { }, want: 0.0021238962030426523, }, + { + name: "Test 5", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 0.0025062434610066964, + }, + { + name: "Test 6", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2}, + }, + want: 0.002478147161370292, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -359,6 +404,90 @@ func Test_CosineDistance(t *testing.T) { } } +func Test_CosineSimilarity(t *testing.T) { + type args struct { + v1 []float64 + v2 []float64 + } + tests := []struct { + name string + args args + want float64 + }{ + { + name: "Test 1", + args: args{ + v1: []float64{1, 2, 3, 4}, + v2: []float64{1, 2, 4, 5}, + }, + 
want: 0.9960065188076063, + }, + { + name: "Test 2", + args: args{ + v1: []float64{10, 20, 30, 40}, + v2: []float64{10.5, 21.5, 31.5, 43.5}, + }, + want: 0.9998746426104126, + }, + { + name: "Test 3.a", + args: args{ + v1: []float64{1, 1}, + v2: []float64{4, 1}, + }, + want: 0.8574929257125441, + }, + { + name: "Test 3.b", + args: args{ + v1: []float64{4, 1}, + v2: []float64{1, 4}, + }, + want: 0.47058823529411764, + }, + { + name: "Test 3.c", + args: args{ + v1: []float64{1, 4}, + v2: []float64{1, 1}, + }, + want: 0.8574929257125441, + }, + { + name: "Test 4", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + }, + want: 0.9978761037969573, + }, + { + name: "Test 5", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 0.9974937565389933, + }, + { + name: "Test 6", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2}, + }, + want: 0.9975218528386297, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got, err := CosineSimilarity[float64](tt.args.v1, tt.args.v2); err != nil || got != tt.want { + t.Errorf("CosineSimilarity() = %v, want %v", got, tt.want) + } + }) + } +} + func Test_InnerProduct(t *testing.T) { type args struct { v1 []float64 @@ -417,6 +546,22 @@ func Test_InnerProduct(t *testing.T) { }, want: -440, }, + { + name: "Test 5", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: -1048, + }, + { + name: "Test 6", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 1}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2}, + }, + want: -882, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -485,6 +630,22 @@ func Test_L2DistanceSq(t *testing.T) { }, want: 10, }, + { + name: "Test 5", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 27, + }, + { + name: "Test 6", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2}, + }, + want: 21, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -555,6 +716,22 @@ func Test_AngularDistance(t *testing.T) { }, want: 0, }, + { + name: "Test 5", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8}, + }, + want: 0, + }, + { + name: "Test 6", + args: args{ + v1: []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1}, + v2: []float64{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 2}, + }, + want: 0, + }, // Test 4: Triangle Inequality check on **normalized** vector // A(1,0),B(2,2), C(0,1) => AB + AC >= BC => 0.25 + 0.25 >= 0.5 diff --git a/pkg/vectorindex/metric/gpu.go b/pkg/vectorindex/metric/gpu.go index d0ad025c1f3f0..49284a4c9ac71 100644 --- a/pkg/vectorindex/metric/gpu.go +++ b/pkg/vectorindex/metric/gpu.go @@ -17,15 +17,15 @@ package metric import ( - cuvs "github.com/rapidsai/cuvs/go" + "github.com/matrixorigin/matrixone/pkg/cuvs" ) var ( - MetricTypeToCuvsMetric = map[MetricType]cuvs.Distance{ - Metric_L2sqDistance: cuvs.DistanceSQEuclidean, - Metric_L2Distance: cuvs.DistanceSQEuclidean, - Metric_InnerProduct: 
cuvs.DistanceInnerProduct, - Metric_CosineDistance: cuvs.DistanceCosine, - Metric_L1Distance: cuvs.DistanceL1, + MetricTypeToCuvsMetric = map[MetricType]cuvs.DistanceType{ + Metric_L2sqDistance: cuvs.L2Expanded, + Metric_L2Distance: cuvs.L2Expanded, + Metric_InnerProduct: cuvs.InnerProduct, + Metric_CosineDistance: cuvs.CosineExpanded, + Metric_L1Distance: cuvs.L1, } ) diff --git a/pkg/vectorindex/metric/resolve.go b/pkg/vectorindex/metric/resolve.go new file mode 100644 index 0000000000000..7b0e3ffe239c8 --- /dev/null +++ b/pkg/vectorindex/metric/resolve.go @@ -0,0 +1,104 @@ +// Copyright 2023 Matrix Origin +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metric + +import ( + "github.com/matrixorigin/matrixone/pkg/common/moerr" + "github.com/matrixorigin/matrixone/pkg/container/types" +) + +// IMPORTANT: Elkans Kmeans always use L2Distance for dense vector or images. After getting the centroids, we can use other distance function +// specified by user to assign vector to corresponding centroids (CENTROIDX JOIN / ProductL2). 
+ +func ResolveKmeansDistanceFn[T types.RealNumbers](metric MetricType, spherical bool) (DistanceFunction[T], bool, error) { + if spherical { + return ResolveKmeansDistanceFnForSparse[T](metric) + } + return ResolveKmeansDistanceFnForDense[T](metric) +} + +func ResolveKmeansDistanceFnForDense[T types.RealNumbers](metric MetricType) (DistanceFunction[T], bool, error) { + var distanceFunction DistanceFunction[T] + normalize := false + switch metric { + case Metric_L2Distance: + distanceFunction = L2Distance[T] + normalize = false + case Metric_L2sqDistance: + distanceFunction = L2Distance[T] + normalize = false + case Metric_InnerProduct: + distanceFunction = L2Distance[T] + normalize = false + case Metric_CosineDistance: + distanceFunction = L2Distance[T] + normalize = false + case Metric_L1Distance: + distanceFunction = L2Distance[T] + normalize = false + default: + return nil, normalize, moerr.NewInternalErrorNoCtx("invalid distance type") + } + return distanceFunction, normalize, nil +} + +// IMPORTANT: Spherical Kmeans always use Spherical Distance / Cosine Similarity for Sparse vector or text embedding (TD-IDF). +// After getting the centroids, we can use other distance function +// specified by user to assign vector to corresponding centroids (CENTROIDX JOIN / ProductL2). 
+func ResolveKmeansDistanceFnForSparse[T types.RealNumbers](metric MetricType) (DistanceFunction[T], bool, error) { + var distanceFunction DistanceFunction[T] + normalize := false + switch metric { + case Metric_L2Distance: + distanceFunction = L2Distance[T] + normalize = false + case Metric_L2sqDistance: + distanceFunction = L2Distance[T] + normalize = false + case Metric_InnerProduct: + distanceFunction = SphericalDistance[T] + normalize = true + case Metric_CosineDistance: + distanceFunction = SphericalDistance[T] + normalize = true + case Metric_L1Distance: + distanceFunction = L2Distance[T] + normalize = false + default: + return nil, normalize, moerr.NewInternalErrorNoCtx("invalid distance type") + } + return distanceFunction, normalize, nil +} + +// ResolveDistanceFn is used for similarity score for search and assign vector to centroids (CENTROIDX JOIN / ProductL2). +// IMPORTANT: Don't use it for Elkans Kmeans +func ResolveDistanceFn[T types.RealNumbers](metric MetricType) (DistanceFunction[T], error) { + var distanceFunction DistanceFunction[T] + switch metric { + case Metric_L2Distance: + distanceFunction = L2DistanceSq[T] + case Metric_L2sqDistance: + distanceFunction = L2DistanceSq[T] + case Metric_InnerProduct: + distanceFunction = InnerProduct[T] + case Metric_CosineDistance: + distanceFunction = CosineDistance[T] + case Metric_L1Distance: + distanceFunction = L1Distance[T] + default: + return nil, moerr.NewInternalErrorNoCtx("invalid distance type") + } + return distanceFunction, nil +} diff --git a/test/distributed/cases/array/array_index_knn.result b/test/distributed/cases/array/array_index_knn.result index 383f668173156..73cbc13e31348 100644 --- a/test/distributed/cases/array/array_index_knn.result +++ b/test/distributed/cases/array/array_index_knn.result @@ -57,12 +57,12 @@ insert into t1 values(11, "[1111,1111,1111,1111]", "11"); insert into t1 values(12, "[1112,1112,1112,1112]", "12"); insert into t1 values(13, "[1113,1113,1113,1113]", "13"); 
alter table t1 alter reindex idx1 ivfflat lists=4; -select a, b from t1 order by l2_distance(b, "[1,0,0,0]") limit 4; +select a, b from t1 order by l2_distance(b, "[1,0,0,0]") limit 3; a b 1 [1, 0, 0, 0] 2 [2, 0, 0, 0] 3 [3, 0, 0, 0] -select a, b from t1 order by l2_distance(b, "[11,11,0,0]") limit 4; +select a, b from t1 order by l2_distance(b, "[11,11,0,0]") limit 3; a b 4 [11, 11, 0, 0] 5 [12, 12, 0, 0] @@ -128,12 +128,12 @@ insert into t2 values(11, "[1111,1111,1111,1111]", "11", 11); insert into t2 values(12, "[1112,1112,1112,1112]", "12", 12); insert into t2 values(13, "[1113,1113,1113,1113]", "13", 13); alter table t2 alter reindex idx2 ivfflat lists=4; -select a, b from t2 order by l2_distance(b, "[1,0,0,0]") limit 4; +select a, b from t2 order by l2_distance(b, "[1,0,0,0]") limit 3; a b 1 [1, 0, 0, 0] 2 [2, 0, 0, 0] 3 [3, 0, 0, 0] -select a, b from t2 order by l2_distance(b, "[11,11,0,0]") limit 4; +select a, b from t2 order by l2_distance(b, "[11,11,0,0]") limit 3; a b 4 [11, 11, 0, 0] 5 [12, 12, 0, 0] @@ -188,12 +188,12 @@ insert into t3 values(11, "[1111,1111,1111,1111]", "11"); insert into t3 values(12, "[1112,1112,1112,1112]", "12"); insert into t3 values(13, "[1113,1113,1113,1113]", "13"); alter table t3 alter reindex idx3 ivfflat lists=4; -select a, b from t3 order by l2_distance(b, "[1,0,0,0]") limit 4; +select a, b from t3 order by l2_distance(b, "[1,0,0,0]") limit 3; a b 1 [1, 0, 0, 0] 2 [2, 0, 0, 0] 3 [3, 0, 0, 0] -select a, b from t3 order by l2_distance(b, "[11,11,0,0]") limit 4; +select a, b from t3 order by l2_distance(b, "[11,11,0,0]") limit 3; a b 4 [11, 11, 0, 0] 5 [12, 12, 0, 0] @@ -254,12 +254,12 @@ a b 8 [112, 112, 112, 0] 6 [13, 13, 0, 0] create index idx5 using ivfflat on t5(b) lists=3 op_type "vector_l2_ops"; -select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 7; +select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 2; a b 7 [111, 111, 111, 0] 8 [112, 112, 112, 0] insert into t5 values(11, 
"[114,114,114,0]", "11"); -select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 7; +select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 3; a b 7 [111, 111, 111, 0] 8 [112, 112, 112, 0] diff --git a/test/distributed/cases/array/array_index_knn.sql b/test/distributed/cases/array/array_index_knn.sql index 20ff7e8ec8fec..9780c9ffcf70b 100644 --- a/test/distributed/cases/array/array_index_knn.sql +++ b/test/distributed/cases/array/array_index_knn.sql @@ -49,8 +49,8 @@ insert into t1 values(12, "[1112,1112,1112,1112]", "12"); insert into t1 values(13, "[1113,1113,1113,1113]", "13"); alter table t1 alter reindex idx1 ivfflat lists=4; -select a, b from t1 order by l2_distance(b, "[1,0,0,0]") limit 4; -select a, b from t1 order by l2_distance(b, "[11,11,0,0]") limit 4; +select a, b from t1 order by l2_distance(b, "[1,0,0,0]") limit 3; +select a, b from t1 order by l2_distance(b, "[11,11,0,0]") limit 3; select a, b from t1 order by l2_distance(b, "[111,111,111,0]") limit 4; select a, b from t1 order by l2_distance(b, "[1111,1111,1111,1111]") limit 4; @@ -85,8 +85,8 @@ insert into t2 values(12, "[1112,1112,1112,1112]", "12", 12); insert into t2 values(13, "[1113,1113,1113,1113]", "13", 13); alter table t2 alter reindex idx2 ivfflat lists=4; -select a, b from t2 order by l2_distance(b, "[1,0,0,0]") limit 4; -select a, b from t2 order by l2_distance(b, "[11,11,0,0]") limit 4; +select a, b from t2 order by l2_distance(b, "[1,0,0,0]") limit 3; +select a, b from t2 order by l2_distance(b, "[11,11,0,0]") limit 3; select a, b from t2 order by l2_distance(b, "[111,111,111,0]") limit 4; select a, b from t2 order by l2_distance(b, "[1111,1111,1111,1111]") limit 4; @@ -119,8 +119,8 @@ insert into t3 values(12, "[1112,1112,1112,1112]", "12"); insert into t3 values(13, "[1113,1113,1113,1113]", "13"); alter table t3 alter reindex idx3 ivfflat lists=4; -select a, b from t3 order by l2_distance(b, "[1,0,0,0]") limit 4; -select a, b from t3 order by 
l2_distance(b, "[11,11,0,0]") limit 4; +select a, b from t3 order by l2_distance(b, "[1,0,0,0]") limit 3; +select a, b from t3 order by l2_distance(b, "[11,11,0,0]") limit 3; select a, b from t3 order by l2_distance(b, "[111,111,111,0]") limit 4; select a, b from t3 order by l2_distance(b, "[1111,1111,1111,1111]") limit 4; @@ -175,10 +175,10 @@ create index idx5 using ivfflat on t5(b) lists=3 op_type "vector_l2_ops"; --| 0 | 3 | 7 | [111, 111, 111, 0] | --| 0 | 3 | 8 | [112, 112, 112, 0] | --+--------------------------------+---------------------------+--------------------+------------------------------+ -select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 7; +select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 2; insert into t5 values(11, "[114,114,114,0]", "11"); -select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 7; +select a, b from t5 order by l2_distance(b, "[111,111,111,0]") limit 3; -- post SET probe_limit = 5; diff --git a/test/distributed/cases/vector/vector_index.result b/test/distributed/cases/vector/vector_index.result index 3562fef31f226..d471b0cb7d11c 100644 --- a/test/distributed/cases/vector/vector_index.result +++ b/test/distributed/cases/vector/vector_index.result @@ -163,10 +163,12 @@ insert into vector_index_08(d) values ("[8.555,2.11,7.22]"); alter table vector_index_08 alter reindex idx02 ivfflat lists=3; select * from vector_index_08 where a>9774 order by L2_DISTANCE(d,"[2.36,0.021,9.222]") desc limit 2; a b c d +9778 null null [8.555, 2.11, 7.22] 9777 null null [2.36, 5.021, 9.222] alter table vector_index_08 rename column d to e; select * from vector_index_08 where a>9775 order by L2_DISTANCE(e,"[8.555,2.11,7.22]") desc limit 2; a b c e +9777 null null [2.36, 5.021, 9.222] 9778 null null [8.555, 2.11, 7.22] alter table vector_index_08 drop column e; select * from vector_index_08; @@ -295,13 +297,13 @@ a b c 9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 
6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 select *, cosine_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") as orderbyfn from vector_cos_01 order by orderbyfn ASC LIMIT 2; a b c orderbyfn -9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.03196156024932861 +9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.03196178004145622 select *, l2_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 
4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") as orderbyfn from vector_cos_01 order by cosine_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") ASC LIMIT 2; a b c orderbyfn 9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 127.42056274414062 select *, cosine_distance(b, "[2, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") as orderbyfn from 
vector_cos_01 order by cosine_distance(b, "[1, 15, 15, 0, 5, 7, 5, 5, 4, 0, 0, 0, 28, 1, 12, 5, 75, 20, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13]") ASC LIMIT 2; a b c orderbyfn -9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.031903373234243526 +9777 [16, 15, 0, 0, 5, 46, 5, 5, 4, 0, 0, 0, 28, 118, 12, 5, 75, 44, 5, 0, 6, 32, 6, 49, 41, 74, 9, 1, 0, 0, 0, 9, 1, 9, 16, 41, 71, 80, 3, 0, 0, 4, 3, 5, 51, 106, 11, 3, 112, 28, 13, 1, 4, 8, 3, 104, 118, 14, 1, 1, 0, 0, 0, 88, 3, 27, 46, 118, 108, 49, 2, 0, 1, 46, 118, 118, 27, 12, 0, 0, 33, 118, 118, 8, 0, 0, 0, 4, 118, 95, 40, 0, 0, 0, 1, 11, 27, 38, 12, 12, 18, 29, 3, 2, 13, 30, 94, 78, 30, 19, 9, 3, 31, 45, 70, 42, 15, 1, 3, 12, 14, 22, 16, 2, 3, 17, 24, 13] 4 0.031903598457574844 drop table vector_cos_01; drop table if exists test_distance_issue; create table test_distance_issue ( @@ -321,11 +323,9 @@ CREATE INDEX idx_embedding USING ivfflat ON test_distance_issue(embedding) LISTS SELECT id, name, score FROM test_distance_issue WHERE score >= 4.0 ORDER BY l2_distance(embedding, 
'[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 3; +LIMIT 1; id name score 2 Vector B 4.5 -1 Vector A 5.0 -3 Vector C 4.0 SELECT id, name, score FROM test_distance_issue WHERE id IN (1, 2, 3) ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]'); @@ -336,25 +336,20 @@ id name score SELECT id, name, score FROM test_distance_issue WHERE score >= 4.0 AND score < 5.0 ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 2; +LIMIT 1; id name score 2 Vector B 4.5 -3 Vector C 4.0 SELECT id, name, score FROM test_distance_issue WHERE score > 3.0 AND score <= 4.5 ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 3; +LIMIT 1; id name score 2 Vector B 4.5 -3 Vector C 4.0 -4 Vector D 3.5 SELECT id, name, score FROM test_distance_issue WHERE name LIKE 'Vector%' AND score >= 4.0 ORDER BY l2_distance(embedding, 
'[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 3; +LIMIT 1; id name score 2 Vector B 4.5 -1 Vector A 5.0 -3 Vector C 4.0 drop table test_distance_issue; SET probe_limit = 5; diff --git a/test/distributed/cases/vector/vector_index.sql b/test/distributed/cases/vector/vector_index.sql index 9c4408f079683..88786b4991355 100644 --- a/test/distributed/cases/vector/vector_index.sql +++ b/test/distributed/cases/vector/vector_index.sql @@ -238,7 +238,7 @@ CREATE INDEX idx_embedding USING ivfflat ON test_distance_issue(embedding) LISTS SELECT id, name, score FROM test_distance_issue WHERE score >= 4.0 ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 3; +LIMIT 1; -- Test 2: Query same IDs directly (baseline comparison) SELECT id, name, score FROM test_distance_issue @@ -249,19 +249,19 @@ ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980 SELECT id, name, score FROM test_distance_issue WHERE score >= 4.0 AND score < 5.0 ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 2; +LIMIT 1; -- Test 4: Filter with different comparison operators SELECT id, name, score FROM test_distance_issue WHERE 
score > 3.0 AND score <= 4.5 ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 3; +LIMIT 1; -- Test 5: Filter with string column SELECT id, name, score FROM test_distance_issue WHERE name LIKE 'Vector%' AND score >= 4.0 ORDER BY l2_distance(embedding, '[0.863103449344635,0.6232981085777283,0.3308980166912079,0.06355834752321243,0.3109823167324066,0.32518333196640015,0.7296061515808105,0.6375574469566345,0.8872127532958984,0.472214937210083,0.11959424614906311,0.7132447957992554,0.7607850432395935,0.5612772107124329,0.7709671854972839,0.49379560351371765]') -LIMIT 3; +LIMIT 1; drop table test_distance_issue; diff --git a/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.result b/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.result index cb770f1997dd2..99eeb5c18267f 100644 --- a/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.result +++ b/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.result @@ -1,37 +1,6 @@ create database if not exists dd3; use dd3; set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 2; -set probe_limit = 1; -CREATE TABLE vector_test_merge ( -id INT PRIMARY KEY, -name VARCHAR(100), -category VARCHAR(50), -score FLOAT, -active BOOLEAN DEFAULT true, -embedding vecf32(16) -); -INSERT INTO vector_test_merge (id, name, category, score, active, embedding) VALUES -(1, 'Item A', 'cat1', 5.0, true, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7]'), -(2, 'Item B', 'cat1', 4.5, true, '[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]'), -(3, 'Item C', 'cat2', 4.0, true, '[0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]'), -(4, 'Item D', 'cat2', 3.5, false, 
'[0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1]'), -(5, 'Item E', 'cat3', 3.0, true, '[0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2]'), -(6, 'Item F', 'cat3', 2.5, false, '[0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3]'), -(7, 'Item G', 'cat1', 2.0, true, '[0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4]'), -(8, 'Item H', 'cat2', 1.5, true, '[0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5]'), -(9, 'Item I', 'cat3', 1.0, false, '[0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6]'), -(10, 'Item J', 'cat1', 0.5, true, '[0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1]'); -CREATE INDEX idx_vec_merge USING ivfflat ON vector_test_merge(embedding) lists=4 op_type 'vector_l2_ops'; -SELECT id, name, score FROM vector_test_merge -WHERE category = 'cat1' AND active = true AND score < 3.0 -ORDER BY l2_distance(embedding, '[0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1]') -LIMIT 2 by rank with option 'mode=pre'; -id name score -10 Item J 0.5 -7 Item G 2.0 -set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 0; set probe_limit = 5; CREATE TABLE vector_test_pre_bf ( id INT PRIMARY KEY, @@ -61,7 +30,6 @@ id name score 1 Item A 5.0 2 Item B 4.5 set ivf_preload_entries = 1; -set ivf_small_centroid_threshold = 2; set probe_limit = 5; CREATE TABLE vector_test_pre_bf2 ( id INT PRIMARY KEY, @@ -91,7 +59,6 @@ id name score 1 Item A 5.0 2 Item B 4.5 set ivf_preload_entries = 1; -set ivf_small_centroid_threshold = 2; set probe_limit = 5; CREATE TABLE vector_test_pre_bf3 ( id INT PRIMARY KEY, @@ -121,7 +88,6 @@ id name score 1 Item A 5.0 2 Item B 4.5 set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 0; set probe_limit = 1; CREATE TABLE vector_test_pre_bf4 ( id INT PRIMARY KEY, @@ -151,9 +117,7 @@ id name score 1 Item A 5.0 2 Item B 4.5 set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 0; set probe_limit = 5; 
-drop table if exists vector_test_merge; drop table if exists vector_test_pre_bf; drop table if exists vector_test_pre_bf2; drop table if exists vector_test_pre_bf3; diff --git a/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.sql b/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.sql index b27ab3c39bfb5..f05800adae4f0 100644 --- a/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.sql +++ b/test/distributed/cases/vector/vector_ivf_pre_bloomfilter.sql @@ -1,49 +1,9 @@ create database if not exists dd3; use dd3; --- CASE 1: test merge small centroid - -set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 2; -set probe_limit = 1; - --- Setup test tables -CREATE TABLE vector_test_merge ( - id INT PRIMARY KEY, - name VARCHAR(100), - category VARCHAR(50), - score FLOAT, - active BOOLEAN DEFAULT true, - embedding vecf32(16) -); - - --- Insert test data with diverse patterns -INSERT INTO vector_test_merge (id, name, category, score, active, embedding) VALUES -(1, 'Item A', 'cat1', 5.0, true, '[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7]'), -(2, 'Item B', 'cat1', 4.5, true, '[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]'), -(3, 'Item C', 'cat2', 4.0, true, '[0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]'), -(4, 'Item D', 'cat2', 3.5, false, '[0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1]'), -(5, 'Item E', 'cat3', 3.0, true, '[0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2]'), -(6, 'Item F', 'cat3', 2.5, false, '[0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3]'), -(7, 'Item G', 'cat1', 2.0, true, '[0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4]'), -(8, 'Item H', 'cat2', 1.5, true, '[0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5]'), -(9, 'Item I', 'cat3', 1.0, false, '[0.9,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.1,0.2,0.3,0.4,0.5,0.6]'), -(10, 'Item J', 'cat1', 0.5, true, 
'[0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1]'); - -CREATE INDEX idx_vec_merge USING ivfflat ON vector_test_merge(embedding) lists=4 op_type 'vector_l2_ops'; - -SELECT id, name, score FROM vector_test_merge -WHERE category = 'cat1' AND active = true AND score < 3.0 -ORDER BY l2_distance(embedding, '[0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1]') -LIMIT 2 by rank with option 'mode=pre'; - --- END test merge small centroid - -- CASE 2: test build bloomfilter on the fly set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 0; set probe_limit = 5; -- Setup test tables @@ -82,7 +42,6 @@ LIMIT 2 by rank with option 'mode=pre'; -- CASE 3: test preload entries bloomfilter set ivf_preload_entries = 1; -set ivf_small_centroid_threshold = 2; set probe_limit = 5; -- Setup test tables @@ -121,7 +80,6 @@ LIMIT 2 by rank with option 'mode=pre'; -- CASE 4: test pre-filter with NIL centroid set ivf_preload_entries = 1; -set ivf_small_centroid_threshold = 2; set probe_limit = 5; -- Setup test tables @@ -161,7 +119,6 @@ LIMIT 2 by rank with option 'mode=pre'; -- CASE 5: test pre-filter with unique join key > #entries in centroids set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 0; set probe_limit = 1; -- Setup test tables @@ -199,9 +156,7 @@ LIMIT 2 by rank with option 'mode=pre'; -- Cleanup set ivf_preload_entries = 0; -set ivf_small_centroid_threshold = 0; set probe_limit = 5; -drop table if exists vector_test_merge; drop table if exists vector_test_pre_bf; drop table if exists vector_test_pre_bf2; drop table if exists vector_test_pre_bf3; diff --git a/test/distributed/cases/vector/vector_ivf_retry.result b/test/distributed/cases/vector/vector_ivf_retry.result index a3e5366e4675f..05a4a05a132f6 100644 --- a/test/distributed/cases/vector/vector_ivf_retry.result +++ b/test/distributed/cases/vector/vector_ivf_retry.result @@ -9,9 +9,9 @@ insert into t_phase1 values (4, '[1,1,0]', 2); insert into t_phase1 values (5, 
'[1,0,1]', 3); create index idx_phase1 using ivfflat on t_phase1(vec) lists=2 op_type 'vector_l2_ops'; set experimental_ivf_index = 1; -select id from t_phase1 order by l2_distance(vec, '[0,0,0]') limit 1 by rank with option 'mode=auto'; +select id from t_phase1 order by l2_distance(vec, '[1,0,0]') limit 1 by rank with option 'mode=auto'; id -2 +1 select id from t_phase1 where category = 1 order by l2_distance(vec, '[0,0,0]') limit 1 by rank with option 'mode=auto'; id 1 @@ -114,6 +114,7 @@ select id, filter_col from t_retry where filter_col = 1 order by l2_distance(vec id filter_col 999 1 drop table t_retry; +set probe_limit = 2; drop table if exists t_edge; create table t_edge(id int primary key, vec vecf32(3), status int); insert into t_edge values (1, '[1,0,0]', 1); @@ -128,7 +129,7 @@ id 1 select id from t_edge order by l2_distance(vec, '[0,0,0]') limit 2 by rank with option 'mode=auto'; id -3 +2 1 drop table t_edge; drop table if exists t_phase6; diff --git a/test/distributed/cases/vector/vector_ivf_retry.sql b/test/distributed/cases/vector/vector_ivf_retry.sql index 786b589908b97..598b909bf4dcc 100644 --- a/test/distributed/cases/vector/vector_ivf_retry.sql +++ b/test/distributed/cases/vector/vector_ivf_retry.sql @@ -25,7 +25,7 @@ set experimental_ivf_index = 1; -- Test 1.1: mode=auto syntax is accepted -- Expectation: Returns closest vector to [0,0,0] -select id from t_phase1 order by l2_distance(vec, '[0,0,0]') limit 1 by rank with option 'mode=auto'; +select id from t_phase1 order by l2_distance(vec, '[1,0,0]') limit 1 by rank with option 'mode=auto'; -- Test 1.2: mode=auto with filter -- Expectation: Returns id 1 or 2 (category=1, closest to [0,0,0]) @@ -201,6 +201,7 @@ drop table t_retry; -- Edge Cases and Boundary Tests -- ============================================================================= +set probe_limit = 2; drop table if exists t_edge; create table t_edge(id int primary key, vec vecf32(3), status int);