Skip to content

Commit 3d01122

Browse files
ikawrakow and Kawrakow authored
CUDA : faster k-quant dot kernels (#1862)
* cuda : faster k-quant dot kernels
* Improve Q2_K dot kernel on older GPUs. We now have a K_QUANTS_PER_ITERATION macro, which should be set to 1 on older and to 2 on newer GPUs. With this, we preserve the performance of the original PR on RTX-4080, and are faster compared to master on GTX-1660.
* Improve Q6_K dot kernel on older GPUs. Using the same K_QUANTS_PER_ITERATION macro as in the last commit, we preserve performance on RTX-4080 and speed up Q6_K on a GTX-1660.
* Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile. Allowed values are 1 or 2. 2 gives the best performance on modern GPUs and is set as default. On older GPUs 1 may work better.
* PR comments
--------- Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 602c748 commit 3d01122

File tree

3 files changed

+385
-221
lines changed

3 files changed

+385
-221
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
7070
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
7171
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
7272
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
73+
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
7374
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
7475
option(LLAMA_METAL "llama: use Metal" OFF)
7576
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -201,6 +202,7 @@ if (LLAMA_CUBLAS)
201202
add_compile_definitions(GGML_USE_CUBLAS)
202203
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
203204
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
205+
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
204206

205207
if (LLAMA_STATIC)
206208
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ ifdef LLAMA_CUDA_DMMV_Y
171171
else
172172
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
173173
endif # LLAMA_CUDA_DMMV_Y
174+
ifdef LLAMA_CUDA_KQUANTS_ITER
175+
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
176+
else
177+
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
178+
endif
174179
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
175180
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
176181
endif # LLAMA_CUBLAS

0 commit comments

Comments
 (0)