@@ -7830,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
    }

#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
    for (int id = 0; id < g_device_count; ++id) {
        CUDA_CHECK(ggml_cuda_set_device(id));
@@ -7881,8 +7886,6 @@ static void ggml_cuda_op_mul_mat(
    const int nb2 = dst->nb[2];
    const int nb3 = dst->nb[3];

-    ggml_cuda_set_peer_access(ne11);
-
    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
@@ -8781,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);

+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
+
    const struct ggml_tensor * ids = src0;
    const int32_t id = ((int32_t *) dst->op_params)[0];
    const int32_t n_as = ((int32_t *) dst->op_params)[1];

    std::vector<char> ids_host(ggml_nbytes(ids));

+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
    if (ids->backend == GGML_BACKEND_GPU) {
        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
    } else {
        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
    }
@@ -8804,37 +8812,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
    ggml_tensor src1_row = *src1;
    ggml_tensor dst_row = *dst;

-    src1_row.ne[1] = 1;
-    dst_row.ne[1] = 1;
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;

-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
+    char * src1_original = (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  = (char *)  dst_extra->data_device[g_main_device];

-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
+    if (src1->ne[1] == 1) {
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));

-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

+            GGML_ASSERT(row_id >= 0 && row_id < n_as);

-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        //int32_t row_id;
-        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];

-        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?

-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?

-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);

-        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
-        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }

-        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
-        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);

-        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                        nb11, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                        nb1, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous, as_dst);
    }
}
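Note: the new multi-row path above implements a gather/compute/scatter scheme. For each expert `row_id`, it copies the rows of `src1` that the `ids` tensor routes to that expert into a contiguous buffer, runs a single `ggml_cuda_mul_mat` for the whole batch, and then copies the result rows back to their original positions in `dst`. The following is a minimal host-side sketch of the same pattern, illustrative only (plain C++, no CUDA); `n_experts`, `ids`, the row sizes, and the `mul_mat` callback are hypothetical stand-ins, not ggml APIs.

    #include <cstdint>
    #include <cstring>
    #include <functional>
    #include <vector>

    // Sketch of the per-expert batching used above:
    // gather the rows routed to one expert, multiply once, scatter the results back.
    static void mul_mat_id_sketch(
            const std::vector<int32_t> & ids,          // ids[i] = expert selected for row i
            int32_t n_experts,
            size_t src1_row_size, size_t dst_row_size, // bytes per row (nb11 / nb1 in the CUDA code)
            const char * src1, char * dst,
            const std::function<void(int32_t expert, const char * in, char * out, int64_t n_rows)> & mul_mat) {
        std::vector<char> src1_contiguous(ids.size() * src1_row_size);
        std::vector<char> dst_contiguous (ids.size() * dst_row_size);

        for (int32_t expert = 0; expert < n_experts; ++expert) {
            // gather: pack every row routed to this expert into a contiguous buffer
            int64_t num_rows = 0;
            for (size_t i = 0; i < ids.size(); ++i) {
                if (ids[i] != expert) {
                    continue;
                }
                std::memcpy(src1_contiguous.data() + num_rows*src1_row_size, src1 + i*src1_row_size, src1_row_size);
                num_rows++;
            }
            if (num_rows == 0) {
                continue;
            }

            // compute: one matrix multiplication for the whole batch of gathered rows
            mul_mat(expert, src1_contiguous.data(), dst_contiguous.data(), num_rows);

            // scatter: copy each result row back to its original position
            num_rows = 0;
            for (size_t i = 0; i < ids.size(); ++i) {
                if (ids[i] != expert) {
                    continue;
                }
                std::memcpy(dst + i*dst_row_size, dst_contiguous.data() + num_rows*dst_row_size, dst_row_size);
                num_rows++;
            }
        }
    }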
@@ -9027,7 +9091,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
}

void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
        return;
    }
@@ -9369,6 +9433,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
        return false;
    }

+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
    if (params->ith != 0) {
        return true;
    }
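Note: with the hunks above, `ggml_cuda_set_peer_access` is no longer called unconditionally from `ggml_cuda_op_mul_mat`; it now runs from `ggml_cuda_compute_forward`, and only when `src0` is a split tensor, after every device has been synchronized. Toggling peer access between GPUs follows the usual CUDA runtime pattern sketched below; this is a simplified illustration (it omits error handling and the batch-size threshold the real function uses) rather than the actual ggml implementation, and `device_count` stands in for `g_device_count`.

    #include <cuda_runtime.h>

    // Simplified sketch: enable or disable peer access between every pair of devices.
    static void set_peer_access_sketch(int device_count, bool enable) {
        for (int id = 0; id < device_count; ++id) {
            cudaSetDevice(id);
            for (int id_other = 0; id_other < device_count; ++id_other) {
                if (id == id_other) {
                    continue;
                }
                int can_access_peer = 0;
                cudaDeviceCanAccessPeer(&can_access_peer, id, id_other);
                if (!can_access_peer) {
                    continue;
                }
                if (enable) {
                    cudaDeviceEnablePeerAccess(id_other, 0); // flags must be 0
                } else {
                    cudaDeviceDisablePeerAccess(id_other);
                }
            }
        }
    }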