@@ -1169,17 +1169,21 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
+        printf("nb0 == ts && nb1 == ts*ne0/bs\n");
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
     } else if (nb0 == ts) {
+        printf("nb0 == ts\n");
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
     } else {
+        printf("else\n");
         for (int64_t i1 = 0; i1 < i1_diff; i1++) {
             const void * rx = (const void *) ((const char *) x + i1*nb1);
             void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
             // pretend the row is a matrix with cols=1
             cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
             if (r != cudaSuccess) {
-                return r;
+                printf("r = %d\n", r);
+                return cudaSuccess;
             }
         }
         return cudaSuccess;
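The "row as a matrix with cols=1" loop above copies a non-contiguous row one element at a time: cudaMemcpy2DAsync is given a width of one element (ts/bs bytes), a height of ne0 "rows", and the element stride nb0 as the source pitch. A minimal standalone sketch of that trick, separate from the patch (buffer sizes and the stride value are illustrative, not taken from ggml):

```cuda
// Sketch: gather every other float from a strided device buffer into a
// contiguous one by treating each element as a 1-column "row".
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int64_t ne0 = 8;             // number of elements (assumed)
    const size_t  ts  = sizeof(float); // type size; block size bs == 1 here
    const size_t  nb0 = 2*ts;          // non-contiguous: stride of 2 floats

    float *src, *dst;
    cudaMalloc(&src, ne0*nb0);
    cudaMalloc(&dst, ne0*ts);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // width = ts bytes per "row", height = ne0 rows; the source pitch nb0
    // skips the gap between consecutive elements, the dest pitch ts packs
    // them contiguously.
    cudaError_t r = cudaMemcpy2DAsync(dst, ts, src, nb0, ts, ne0,
                                      cudaMemcpyDeviceToDevice, stream);
    printf("cudaMemcpy2DAsync: %s\n", cudaGetErrorString(r));

    cudaStreamSynchronize(stream);
    cudaFree(src);
    cudaFree(dst);
    cudaStreamDestroy(stream);
    return 0;
}
```

Note that the patched error path prints r and then returns cudaSuccess anyway, so failures of this copy are logged but no longer propagated to the caller; this is debug instrumentation, not a hardening fix.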
@@ -1906,17 +1910,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc = ggml_cuda_info().devices[id].cc;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
-#ifdef GGML_USE_MUSA
-            use_mul_mat_vec_q = false;
-#endif // GGML_USE_MUSA
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
-#ifdef GGML_USE_MUSA
-        use_mul_mat_vec_q = false;
-#endif // GGML_USE_MUSA
     }
 
     // debug helpers
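This hunk drops the MUSA-specific override that forced use_mul_mat_vec_q off, leaving kernel selection to the per-device compute-capability checks. The pattern is: query each device's cc and fold it into feature flags. A standalone sketch of that gating, using the plain CUDA runtime rather than ggml's helpers (the cc threshold in fast_fp16_sketch is an assumption for illustration, not ggml's actual fast_fp16_available logic):

```cuda
// Sketch: derive per-device feature flags from compute capability,
// analogous to the any_gpus_with_slow_fp16 accumulation above.
#include <cuda_runtime.h>
#include <cstdio>

static bool fast_fp16_sketch(int cc) {
    // Assumption for this sketch: treat fp16 arithmetic as "fast"
    // from Volta (cc 700) onward.
    return cc >= 700;
}

int main() {
    int n = 0;
    cudaGetDeviceCount(&n);

    bool any_slow_fp16 = false;
    for (int id = 0; id < n; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // Same encoding ggml uses: 7.0 -> 700, 8.6 -> 860, ...
        const int cc = 100*prop.major + 10*prop.minor;
        any_slow_fp16 = any_slow_fp16 || !fast_fp16_sketch(cc);
        printf("device %d: cc %d\n", id, cc);
    }
    printf("any_slow_fp16 = %d\n", any_slow_fp16);
    return 0;
}
```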