Skip to content

Commit 9517fce

Browse files
CUDA: fix tensor core logic for Pascal and HIP
1 parent 65e5f6d commit 9517fce

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

ggml-cuda.cu

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,7 @@
134134
// TODO: improve this to be correct for more hardware
135135
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
136136
// probably other such cases, and not sure what happens on AMD hardware
137-
#if !defined(GGML_CUDA_FORCE_MMQ)
137+
#if !defined(GGML_CUDA_FORCE_MMQ) && !defined(GGML_USE_HIPBLAS)
138138
#define CUDA_USE_TENSOR_CORES
139139
#endif
140140

@@ -8663,7 +8663,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
86638663
}
86648664

86658665
#ifdef CUDA_USE_TENSOR_CORES
8666-
const bool use_tensor_cores = true;
8666+
const bool use_tensor_cores = min_compute_capability >= CC_VOLTA;
86678667
#else
86688668
const bool use_tensor_cores = false;
86698669
#endif
@@ -8706,7 +8706,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
87068706

87078707
// when tensor cores are available, use them for large batch size
87088708
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
8709-
if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
8709+
if (use_tensor_cores && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
87108710
use_mul_mat_q = false;
87118711
}
87128712

0 commit comments

Comments (0)