File tree 2 files changed +3
-3
lines changed 2 files changed +3
-3
lines changed Original file line number Diff line number Diff line change @@ -280,8 +280,8 @@ if (LLAMA_CUBLAS)
280
280
# 52 == lowest CUDA 12 standard
281
281
# 60 == f16 CUDA intrinsics
282
282
# 61 == integer CUDA intrinsics
283
- # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
284
- if (LLAMA_CUDA_DMMV_F16)
283
+ # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
284
+ if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
285
285
set (CMAKE_CUDA_ARCHITECTURES "60;61;70" ) # needed for f16 CUDA intrinsics
286
286
else ()
287
287
set (CMAKE_CUDA_ARCHITECTURES "52;61;70" ) # lowest CUDA 12 standard + lowest for integer intrinsics
Original file line number Diff line number Diff line change @@ -2877,7 +2877,7 @@ static __global__ void mul_mat_q(
2877
2877
2878
2878
__syncthreads ();
2879
2879
2880
- #if __CUDA_ARCH__ >= 700 // TODO: actually test this with compute capability 7.X cards
2880
+ #if __CUDA_ARCH__ >= 700 // Unrolling the loop is slower on Pascal
2881
2881
#pragma unroll
2882
2882
#endif // __CUDA_ARCH__ >= 700
2883
2883
for (int k = 0 ; k < WARP_SIZE/vdr; ++k) {
You can’t perform that action at this time.
0 commit comments