Fix LLAMA_CUDA_F16 check: also select the f16-capable CUDA architectures when LLAMA_CUDA_F16 is set

JohannesGaessler · JohannesGaessler · commit dd0e0d975757 · 2023-08-01T16:09:47.000+02:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -280,8 +280,8 @@ if (LLAMA_CUBLAS)
         # 52 == lowest CUDA 12 standard
         # 60 == f16 CUDA intrinsics
         # 61 == integer CUDA intrinsics
-        # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (LLAMA_CUDA_DMMV_F16)
+        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
             set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -2877,7 +2877,7 @@ static __global__ void mul_mat_q(
 
         __syncthreads();
 
-#if __CUDA_ARCH__ >= 700 // TODO: actually test this with compute capability 7.X cards
+#if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
 #pragma unroll
 #endif // __CUDA_ARCH__ >= 700
         for (int k = 0; k < WARP_SIZE/vdr; ++k) {