@@ -5247,7 +5247,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     if (src->backend == GGML_BACKEND_CPU) {
         kind = cudaMemcpyHostToDevice;
         src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU) {
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
         struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
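
For reference, the `kind` selected here is presumably passed to a strided CUDA copy further down in `ggml_cuda_cpy_tensor_2d`. A minimal sketch of that selection pattern, with a plain `cudaMemcpy2DAsync` call and hypothetical size parameters standing in for ggml's tensor bookkeeping (not the actual helper):

```cpp
#include <cuda_runtime.h>

// Sketch only: copy nrows rows of row_bytes each into a contiguous destination,
// picking the memcpy kind from where the source buffer lives.
static cudaError_t copy_rows_2d(void * dst, const void * src, bool src_on_host,
                                size_t row_bytes, size_t src_pitch, size_t nrows,
                                cudaStream_t stream) {
    const cudaMemcpyKind kind = src_on_host ? cudaMemcpyHostToDevice
                                            : cudaMemcpyDeviceToDevice;
    // Source rows may be padded (src_pitch >= row_bytes); destination is packed.
    return cudaMemcpy2DAsync(dst, row_bytes, src, src_pitch,
                             row_bytes, nrows, kind, stream);
}
```
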
@@ -5289,9 +5290,7 @@ inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
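
The asserts dropped here previously pinned `src0` to F32/F16 and `dst` to F32, so `ggml_cuda_op_add` can now be reached with other type combinations, e.g. an F16 destination. A purely illustrative sketch, not the kernel this diff relies on, of how an element-wise add with a templated destination type can be written:

```cpp
#include <cuda_fp16.h>

// Illustration only: the destination type is a template parameter, so the same
// kernel body can produce F32 or F16 output, e.g. add_generic<half><<<...>>>(...).
template <typename dst_t>
static __global__ void add_generic(const float * x, const float * y, dst_t * dst, int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = (dst_t) (x[i] + y[i]);
}
```
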
@@ -5631,10 +5630,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t ne0 = dst->ne[0];
     const int64_t row_diff = row_high - row_low;

-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-    size_t src0_as;
-    float * src0_ddf_i = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as);
-    to_fp32_cuda(src0_dd_i, src0_ddf_i, row_diff*ne00, stream);
+    float * src0_ddq_as_f32;
+    size_t src0_as = 0;
+
+    if (src0->type != GGML_TYPE_F32) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+        src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+        to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+    }
+    const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;

     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -5651,10 +5655,11 @@ inline void ggml_cuda_op_mul_mat_cublas(
                 src1_ddf_i, ne10,
         &beta, dst_dd_i, ldc));

-    ggml_cuda_pool_free(src0_ddf_i, src0_as);
+    if (src0_as > 0) {
+        ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+    }

     (void) dst;
-    (void) src0_dd_i;
     (void) src1_ddq_i;
     (void) src1_padded_row_size;
 }
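
Both hunks in `ggml_cuda_op_mul_mat_cublas` implement the same idea: dequantize `src0` into a temporary F32 buffer only when it is not already F32, and free that buffer only if it was actually allocated. A self-contained sketch of the pattern, with `cudaMalloc`/`cudaFree` and a hypothetical `to_fp32` callback standing in for ggml's pool allocator and `ggml_get_to_fp32_cuda`:

```cpp
#include <cuda_runtime.h>

// Sketch of the allocate-only-if-needed / free-only-if-allocated pattern above.
static void gemm_input_as_f32(const void * src0_dd, bool src0_is_f32, size_t n_elems,
                              void (*to_fp32)(const void *, float *, size_t, cudaStream_t),
                              cudaStream_t stream) {
    float * tmp_f32 = nullptr;

    if (!src0_is_f32) {
        cudaMalloc((void **) &tmp_f32, n_elems*sizeof(float));
        to_fp32(src0_dd, tmp_f32, n_elems, stream);   // quantized/F16 -> F32
    }
    const float * src0_f32 = src0_is_f32 ? (const float *) src0_dd : tmp_f32;

    // ... cublasSgemm(...) would consume src0_f32 here ...
    (void) src0_f32;

    if (tmp_f32 != nullptr) {                         // only release what was allocated
        cudaFree(tmp_f32);
    }
}
```
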
@@ -5793,7 +5798,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     const bool use_src1 = src1 != nullptr;
     const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;

-    GGML_ASSERT( src0->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
