@@ -7820,7 +7820,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }

-static void ggml_cuda_set_peer_access(const int n_tokens) {
+void ggml_cuda_set_peer_access(const int n_tokens) {
     static bool peer_access_enabled = false;

     const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@@ -7881,8 +7881,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);

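The two hunks above export ggml_cuda_set_peer_access (the static qualifier is dropped) and remove the per-matmul call from ggml_cuda_op_mul_mat, presumably so the toggle can be invoked from outside this translation unit instead. For readers who have not used the underlying CUDA API, the following is a minimal sketch of what enabling peer access between two devices looks like with the CUDA runtime; the helper name is hypothetical and this is not the ggml implementation.

    // Minimal sketch, assuming two visible CUDA devices; not the ggml code path.
    #include <cuda_runtime.h>
    #include <cstdio>

    // Hypothetical helper: allow dev_a to read/write dev_b's memory directly.
    static void enable_peer_access_pair(int dev_a, int dev_b) {
        int can_access = 0;
        cudaDeviceCanAccessPeer(&can_access, dev_a, dev_b);
        if (!can_access) {
            return; // no P2P path between these devices
        }
        cudaSetDevice(dev_a);
        const cudaError_t err = cudaDeviceEnablePeerAccess(dev_b, 0 /* flags must be 0 */);
        if (err != cudaSuccess && err != cudaErrorPeerAccessAlreadyEnabled) {
            fprintf(stderr, "peer access %d -> %d failed: %s\n",
                    dev_a, dev_b, cudaGetErrorString(err));
        }
    }

In the ggml function this kind of call is gated on batch size: peer access is enabled only when n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE and disabled otherwise.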
@@ -8781,16 +8779,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s

     GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);

+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
+
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
     const int32_t n_as = ((int32_t *) dst->op_params)[1];

     std::vector<char> ids_host(ggml_nbytes(ids));

+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -8804,37 +8807,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;

-    src1_row.ne[1] = 1;
-    dst_row.ne[1] = 1;
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;

-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
+    char * src1_original = (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  = (char *)  dst_extra->data_device[g_main_device];

-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
+    if (src1->ne[1] == 1) {
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));

-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);

+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device] = dst_contiguous;

-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        //int32_t row_id;
-        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+        for (int32_t row_id = 0; row_id < 8; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];

-        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                        nb11, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;

-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;

-        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
-        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;

-        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
-        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                        nb1, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+        }

-        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous, as_dst);
     }
 }

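The new multi-row branch above performs one batched multiplication per expert instead of one per token row: for each expert it gathers the src1 rows routed to it (per ids) into a contiguous pool buffer, runs ggml_cuda_mul_mat against that expert's weight tensor dst->src[row_id + 2], and then scatters the result rows back to their original positions in dst. Note that this commit iterates over a hardcoded 8 experts while the assertion bounds row_id by n_as. The following CPU-side sketch mirrors only the gather/compute/scatter control flow with plain std::vector buffers; all names are illustrative and none of this is the actual CUDA implementation.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Stand-in for the per-expert batched matrix multiplication (ggml_cuda_mul_mat).
    static void mul_mat_rows(int32_t expert, const std::vector<float> & src_rows,
                             std::vector<float> & dst_rows, int64_t n_rows, int64_t row_len) {
        (void) expert;
        for (int64_t i = 0; i < n_rows*row_len; ++i) {
            dst_rows[i] = src_rows[i]; // placeholder for the real matmul
        }
    }

    // row_expert[i] holds the expert id selected for token row i.
    static void mul_mat_id_sketch(const std::vector<int32_t> & row_expert,
                                  const std::vector<float> & src, std::vector<float> & dst,
                                  int32_t n_experts, int64_t row_len) {
        const int64_t n_rows = (int64_t) row_expert.size();
        std::vector<float> src_contig(n_rows*row_len);
        std::vector<float> dst_contig(n_rows*row_len);

        for (int32_t expert = 0; expert < n_experts; ++expert) {
            // pass 1: gather every row routed to this expert into a contiguous buffer
            int64_t num_rows = 0;
            for (int64_t i = 0; i < n_rows; ++i) {
                if (row_expert[i] != expert) continue;
                std::copy(src.begin() + i*row_len, src.begin() + (i + 1)*row_len,
                          src_contig.begin() + num_rows*row_len);
                num_rows++;
            }
            if (num_rows == 0) continue;

            // one batched multiplication per expert over its gathered rows
            mul_mat_rows(expert, src_contig, dst_contig, num_rows, row_len);

            // pass 2: scatter the results back to each row's original position
            num_rows = 0;
            for (int64_t i = 0; i < n_rows; ++i) {
                if (row_expert[i] != expert) continue;
                std::copy(dst_contig.begin() + num_rows*row_len, dst_contig.begin() + (num_rows + 1)*row_len,
                          dst.begin() + i*row_len);
                num_rows++;
            }
        }
    }

Grouping rows this way costs two extra device-to-device row copies per token, but lets each expert run a single larger matrix multiplication over its whole batch, which is generally faster than invoking ggml_cuda_mul_mat once per row as the src1->ne[1] == 1 path does.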