@@ -4044,7 +4044,7 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
     cpy_1(cx + x_offset, cdst + dst_offset);
 }
 
-template <bool first_incomplete, bool last_incomplete>
+template <bool first_incomplete, bool last_incomplete, bool save_unquantized>
 static __global__ void cpy_f32_q8_0(
     const char * cx, char * cdst, const int i_blck_0, const int ne00, const int ne01, const int ne02,
     const int nb00, const int nb01, const int nb02, const int nb11, const int nb12) {
@@ -4075,7 +4075,7 @@ static __global__ void cpy_f32_q8_0(
         val = *((float *) src);
     }
 
-    if (last_incomplete && i0 / QK8_0 == (i_blck_0 + ne00) / QK8_0) {
+    if (save_unquantized && last_incomplete && i0 / QK8_0 == (i_blck_0 + ne00) / QK8_0) {
         memcpy(&dst[1 + iqs/8].qs[sizeof(float) * (iqs % 8)], src, sizeof(float));
     }
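
For orientation: the memcpy guarded by save_unquantized stashes the raw float values of an incomplete trailing q8_0 block in the qs bytes of the blocks that follow it (8 floats fit per block). Below is a minimal host-side sketch of that addressing, assuming ggml's usual block_q8_0 layout (2-byte fp16 scale plus QK8_0 == 32 quantized bytes); the struct stand-in and stash_unquantized are illustrative, not part of this patch.

#include <cstdint>
#include <cstdio>
#include <cstring>

#define QK8_0 32

// Layout stand-in for ggml's block_q8_0: a 2-byte fp16 scale followed by
// QK8_0 quantized bytes, so each qs[] array has room for 8 raw floats.
struct block_q8_0 {
    uint16_t d;
    int8_t   qs[QK8_0];
};

// Stash float number `iqs` of the incomplete block into the spare qs bytes of
// the blocks after it, mirroring the kernel's
//   memcpy(&dst[1 + iqs/8].qs[sizeof(float) * (iqs % 8)], src, sizeof(float));
static void stash_unquantized(block_q8_0 * dst, int iqs, float value) {
    std::memcpy(&dst[1 + iqs/8].qs[sizeof(float) * (iqs % 8)], &value, sizeof(float));
}

int main() {
    block_q8_0 blocks[8] = {};
    stash_unquantized(blocks, 13, 1.5f);    // float 13 of the tail -> block 1 + 13/8 = 2, byte offset 4*(13%8) = 20

    float back;
    std::memcpy(&back, &blocks[2].qs[20], sizeof(float));
    std::printf("recovered %.1f\n", back);  // prints 1.5
    return 0;
}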
@@ -5114,7 +5114,7 @@ static void ggml_cpy_f32_f16_cuda(
 
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int i_blck_0, const int ne00, const int ne01, const int ne02,
-    const int nb00, const int nb01, const int nb02, const int nb11, const int nb12, cudaStream_t stream) {
+    const int nb00, const int nb01, const int nb02, const int nb11, const int nb12, const bool pad, cudaStream_t stream) {
 
     const int num_blocks_x = (i_blck_0 + ne00 + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(num_blocks_x, ne01, ne02);
@@ -5125,17 +5125,27 @@ static void ggml_cpy_f32_q8_0_cuda(
 
     if (first_incomplete && last_incomplete) {
         GGML_ASSERT(i_blck_0 + ne00 < QK8_0); // otherwise there would be a race condition
-        cpy_f32_q8_0<true, true><<<block_nums, block_dims, 0, stream>>>
+        GGML_ASSERT(pad == false);
+        cpy_f32_q8_0<true, true, false><<<block_nums, block_dims, 0, stream>>>
             (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
     } else if (first_incomplete && !last_incomplete) {
-        cpy_f32_q8_0<true, false><<<block_nums, block_dims, 0, stream>>>
+        GGML_ASSERT(pad == false);
+        cpy_f32_q8_0<true, false, false><<<block_nums, block_dims, 0, stream>>>
             (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
-    } else if (!first_incomplete && last_incomplete) {
-        cpy_f32_q8_0<false, true><<<block_nums, block_dims, 0, stream>>>
+    } else if (!first_incomplete && last_incomplete && pad) {
+        cpy_f32_q8_0<false, true, false><<<block_nums, block_dims, 0, stream>>>
             (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
-    } else if (!first_incomplete && !last_incomplete) {
-        cpy_f32_q8_0<false, false><<<block_nums, block_dims, 0, stream>>>
+    } else if (!first_incomplete && last_incomplete && !pad) {
+        cpy_f32_q8_0<false, true, true><<<block_nums, block_dims, 0, stream>>>
             (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
+    } else if (!first_incomplete && !last_incomplete && pad) {
+        cpy_f32_q8_0<false, false, true><<<block_nums, block_dims, 0, stream>>>
+            (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
+    } else if (!first_incomplete && !last_incomplete && !pad) {
+        cpy_f32_q8_0<false, false, true><<<block_nums, block_dims, 0, stream>>>
+            (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12);
+    } else {
+        GGML_ASSERT(false);
     }
 }
 
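
As a reading aid for the dispatch above, the sketch below mirrors how each (first_incomplete, last_incomplete, pad) combination selects the save_unquantized template argument; save_unquantized_for is a hypothetical helper, not code from this patch, and the real code instantiates a kernel template rather than returning a bool.

#include <cassert>
#include <cstdio>

// Illustrative mirror of the host-side dispatch: which save_unquantized value
// each combination of flags ends up with.
static bool save_unquantized_for(bool first_incomplete, bool last_incomplete, bool pad) {
    if (first_incomplete) {
        assert(!pad);   // both first_incomplete branches assert pad == false
        return false;
    }
    if (last_incomplete) {
        return !pad;    // the raw tail is only kept when the copy is not padded
    }
    return true;        // no incomplete last block: the kernel never reads the flag
}

int main() {
    std::printf("%d %d %d\n",
                save_unquantized_for(false, true,  true),    // padded, incomplete tail   -> 0
                save_unquantized_for(false, true,  false),   // unpadded, incomplete tail -> 1
                save_unquantized_for(false, false, true));   // only complete blocks      -> 1
    return 0;
}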
@@ -6626,9 +6636,6 @@ void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_te
 }
 
 void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const int64_t ne = ggml_nelements(src0);
-    GGML_ASSERT(ne == ggml_nelements(src1));
-
     GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
 
@@ -6652,6 +6659,16 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
+    const int64_t blck_size = ggml_blck_size(src1->type);
+    const int64_t ne00_padded = ((ne00 + blck_size - 1) / blck_size) * blck_size;
+    const int64_t ne = ggml_nelements(src0);
+    const bool pad = dst->op_params[0] & 1;
+    if (pad) {
+        GGML_ASSERT(ne00_padded * ggml_nrows(src0) == ggml_nelements(src1));
+    } else {
+        GGML_ASSERT(ne == ggml_nelements(src1));
+    }
+
     CUDA_CHECK(cudaSetDevice(g_main_device));
     cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
 
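
The ne00_padded introduced above is the row length rounded up to whole q8_0 blocks; the same quantity drives the destination strides in the contiguous call further down. A small worked check of that arithmetic, assuming QK8_0 == 32 and sizeof(block_q8_0) == 34 (2-byte fp16 scale plus 32 quantized bytes); the concrete ne00 is an example, not taken from the patch.

#include <cstdint>
#include <cstdio>

#define QK8_0 32
// Assumed size of block_q8_0: 2-byte fp16 scale + QK8_0 quantized bytes.
#define SIZEOF_BLOCK_Q8_0 34

int main() {
    const int64_t ne00      = 50;      // example row length
    const int64_t blck_size = QK8_0;

    // Same rounding as in ggml_cuda_cpy: pad the row up to whole q8_0 blocks.
    const int64_t ne00_padded = ((ne00 + blck_size - 1) / blck_size) * blck_size;

    // Row stride passed as nb11 in the contiguous case.
    const int64_t nb11 = ne00_padded * SIZEOF_BLOCK_Q8_0 / QK8_0;

    std::printf("ne00=%lld ne00_padded=%lld blocks/row=%lld nb11=%lld bytes\n",
                (long long) ne00, (long long) ne00_padded,
                (long long) (ne00_padded / QK8_0), (long long) nb11);
    return 0;
}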
@@ -6670,16 +6687,19 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         GGML_ASSERT(nb10 == sizeof(block_q8_0));
 
-        const size_t * op_params = (const size_t *) src1->op_params;
-        const size_t i_blck_0 = op_params[1];
+        size_t i_blck_0 = 0;
+        if (src1->op == GGML_OP_VIEW) {
+            const size_t * op_params = (const size_t *) src1->op_params;
+            i_blck_0 = op_params[1];
+        }
 
         if (ggml_is_contiguous(src1)) {
             ggml_cpy_f32_q8_0_cuda(
                 src0_ddc, src1_ddc, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02,
-                ne00*sizeof(block_q8_0)/QK8_0, ne00*ne01*sizeof(block_q8_0)/QK8_0, cudaStream_main);
+                ne00_padded*sizeof(block_q8_0)/QK8_0, ne00_padded*ne01*sizeof(block_q8_0)/QK8_0, pad, cudaStream_main);
         } else {
             ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, i_blck_0, ne00, ne01, ne02,
-                nb00, nb01, nb02, nb11, nb12, cudaStream_main);
+                nb00, nb01, nb02, nb11, nb12, pad, cudaStream_main);
         }
 
     } else {