@@ -1169,17 +1169,21 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
+        printf("nb0 == ts && nb1 == ts*ne0/bs\n");
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream);
     } else if (nb0 == ts) {
+        printf("nb0 == ts\n");
         return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream);
     } else {
+        printf("else\n");
         for (int64_t i1 = 0; i1 < i1_diff; i1++) {
             const void * rx = (const void *) ((const char *) x + i1*nb1);
             void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
             // pretend the row is a matrix with cols=1
             cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream);
             if (r != cudaSuccess) {
-                return r;
+                printf("r = %d\n", r);
+                return cudaSuccess;
             }
         }
         return cudaSuccess;
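The "row as a matrix with cols=1" loop above copies a non-contiguous row one element at a time: cudaMemcpy2DAsync is given a width of one element (ts/bs bytes), a height of ne0 "rows", and the element stride nb0 as the source pitch. A minimal standalone sketch of that trick, separate from the patch (buffer sizes and the stride value are illustrative, not taken from ggml):

```cuda
// Sketch: gather every other float from a strided device buffer into a
// contiguous one by treating each element as a 1-column "row".
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int64_t ne0 = 8;             // number of elements (assumed)
    const size_t  ts  = sizeof(float); // type size; block size bs == 1 here
    const size_t  nb0 = 2*ts;          // non-contiguous: stride of 2 floats

    float *src, *dst;
    cudaMalloc(&src, ne0*nb0);
    cudaMalloc(&dst, ne0*ts);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // width = ts bytes per "row", height = ne0 rows; the source pitch nb0
    // skips the gap between consecutive elements, the dest pitch ts packs
    // them contiguously.
    cudaError_t r = cudaMemcpy2DAsync(dst, ts, src, nb0, ts, ne0,
                                      cudaMemcpyDeviceToDevice, stream);
    printf("cudaMemcpy2DAsync: %s\n", cudaGetErrorString(r));

    cudaStreamSynchronize(stream);
    cudaFree(src);
    cudaFree(dst);
    cudaStreamDestroy(stream);
    return 0;
}
```

Note that the patched error path prints r and then returns cudaSuccess anyway, so failures of this copy are logged but no longer propagated to the caller; this is debug instrumentation, not a hardening fix.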
@@ -1906,17 +1910,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc = ggml_cuda_info().devices[id].cc;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
-#ifdef GGML_USE_MUSA
-            use_mul_mat_vec_q = false;
-#endif // GGML_USE_MUSA
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
-#ifdef GGML_USE_MUSA
-        use_mul_mat_vec_q = false;
-#endif // GGML_USE_MUSA
     }
 
     // debug helpers
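This hunk drops the MUSA-specific override that forced use_mul_mat_vec_q off, leaving kernel selection to the per-device compute-capability checks. The pattern is: query each device's cc and fold it into feature flags. A standalone sketch of that gating, using the plain CUDA runtime rather than ggml's helpers (the cc threshold in fast_fp16_sketch is an assumption for illustration, not ggml's actual fast_fp16_available logic):

```cuda
// Sketch: derive per-device feature flags from compute capability,
// analogous to the any_gpus_with_slow_fp16 accumulation above.
#include <cuda_runtime.h>
#include <cstdio>

static bool fast_fp16_sketch(int cc) {
    // Assumption for this sketch: treat fp16 arithmetic as "fast"
    // from Volta (cc 700) onward.
    return cc >= 700;
}

int main() {
    int n = 0;
    cudaGetDeviceCount(&n);

    bool any_slow_fp16 = false;
    for (int id = 0; id < n; ++id) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, id);
        // Same encoding ggml uses: 7.0 -> 700, 8.6 -> 860, ...
        const int cc = 100*prop.major + 10*prop.minor;
        any_slow_fp16 = any_slow_fp16 || !fast_fp16_sketch(cc);
        printf("device %d: cc %d\n", id, cc);
    }
    printf("any_slow_fp16 = %d\n", any_slow_fp16);
    return 0;
}
```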