Skip to content

Commit 5885b52

Browse files
CUDA: make MoE tensors contiguous for batch size>1
1 parent 328b83d commit 5885b52

File tree

3 files changed

+87
-25
lines changed

3 files changed

+87
-25
lines changed

ggml-cuda.cu

Lines changed: 84 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -7820,7 +7820,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
78207820
}
78217821
}
78227822

7823-
static void ggml_cuda_set_peer_access(const int n_tokens) {
7823+
void ggml_cuda_set_peer_access(const int n_tokens) {
78247824
static bool peer_access_enabled = false;
78257825

78267826
const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
@@ -7881,8 +7881,6 @@ static void ggml_cuda_op_mul_mat(
78817881
const int nb2 = dst->nb[2];
78827882
const int nb3 = dst->nb[3];
78837883

7884-
ggml_cuda_set_peer_access(ne11);
7885-
78867884
GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
78877885
GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
78887886

@@ -8781,16 +8779,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
87818779

87828780
GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
87838781

8782+
const int64_t nb11 = src1->nb[1];
8783+
const int64_t nb1 = dst->nb[1];
8784+
87848785
const struct ggml_tensor * ids = src0;
87858786
const int32_t id = ((int32_t *) dst->op_params)[0];
87868787
const int32_t n_as = ((int32_t *) dst->op_params)[1];
87878788

87888789
std::vector<char> ids_host(ggml_nbytes(ids));
87898790

8791+
const cudaStream_t stream = g_cudaStreams[g_main_device][0];
8792+
87908793
if (ids->backend == GGML_BACKEND_GPU) {
87918794
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
8792-
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8793-
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8795+
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
8796+
CUDA_CHECK(cudaStreamSynchronize(stream));
87948797
} else {
87958798
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
87968799
}
@@ -8804,37 +8807,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
88048807
ggml_tensor src1_row = *src1;
88058808
ggml_tensor dst_row = *dst;
88068809

8807-
src1_row.ne[1] = 1;
8808-
dst_row.ne[1] = 1;
8810+
src1_row.extra = &src1_row_extra;
8811+
dst_row.extra = &dst_row_extra;
88098812

8810-
src1_row.nb[2] = src1_row.nb[1];
8811-
dst_row.nb[2] = dst_row.nb[1];
8813+
char * src1_original = (char *) src1_extra->data_device[g_main_device];
8814+
char * dst_original = (char *) dst_extra->data_device[g_main_device];
88128815

8813-
src1_row.nb[3] = src1_row.nb[1];
8814-
dst_row.nb[3] = dst_row.nb[1];
8816+
if (src1->ne[1] == 1) {
8817+
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8818+
//int32_t row_id;
8819+
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8820+
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
88158821

8816-
src1_row.extra = &src1_row_extra;
8817-
dst_row.extra = &dst_row_extra;
8822+
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8823+
8824+
GGML_ASSERT(row_id >= 0 && row_id < n_as);
8825+
8826+
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8827+
8828+
src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
8829+
src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
8830+
8831+
dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
8832+
dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
8833+
8834+
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8835+
}
8836+
} else {
8837+
size_t as_src1, as_dst;
8838+
char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
8839+
char * dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst), &as_dst);
88188840

8841+
src1_row_extra.data_device[g_main_device] = src1_contiguous;
8842+
dst_row_extra.data_device[g_main_device] = dst_contiguous;
88198843

8820-
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8821-
//int32_t row_id;
8822-
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
8823-
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
8844+
for (int32_t row_id = 0; row_id < 8; ++row_id) {
8845+
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
88248846

8825-
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8847+
int64_t num_src1_rows = 0;
8848+
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8849+
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
88268850

8827-
GGML_ASSERT(row_id >= 0 && row_id < n_as);
8851+
if (row_id_i != row_id) {
8852+
continue;
8853+
}
8854+
8855+
GGML_ASSERT(row_id >= 0 && row_id < n_as);
8856+
8857+
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
8858+
nb11, cudaMemcpyDeviceToDevice, stream));
8859+
num_src1_rows++;
8860+
}
8861+
8862+
if (num_src1_rows == 0) {
8863+
continue;
8864+
}
8865+
8866+
src1_row.ne[1] = num_src1_rows;
8867+
dst_row.ne[1] = num_src1_rows;
88288868

8829-
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
8869+
src1_row.nb[1] = nb11;
8870+
src1_row.nb[2] = num_src1_rows*nb11;
8871+
src1_row.nb[3] = num_src1_rows*nb11;
88308872

8831-
src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
8832-
src1_row.data = (char *) src1->data + i01*src1->nb[1];
8873+
dst_row.nb[1] = nb1;
8874+
dst_row.nb[2] = num_src1_rows*nb1;
8875+
dst_row.nb[3] = num_src1_rows*nb1;
88338876

8834-
dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
8835-
dst_row.data = (char *) dst->data + i01*dst->nb[1];
8877+
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8878+
8879+
num_src1_rows = 0;
8880+
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
8881+
const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
8882+
8883+
if (row_id_i != row_id) {
8884+
continue;
8885+
}
8886+
8887+
GGML_ASSERT(row_id >= 0 && row_id < n_as);
8888+
8889+
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
8890+
nb1, cudaMemcpyDeviceToDevice, stream));
8891+
num_src1_rows++;
8892+
}
8893+
}
88368894

8837-
ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
8895+
ggml_cuda_pool_free(src1_contiguous, as_src1);
8896+
ggml_cuda_pool_free(dst_contiguous, as_dst);
88388897
}
88398898
}
88408899

ggml-cuda.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
4747

4848
GGML_API int ggml_cuda_get_device_count(void);
4949
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
50+
GGML_API void ggml_cuda_set_peer_access(int n_tokens);
5051

5152
// backend API
5253
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);

llama.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6326,6 +6326,8 @@ static int llama_decode_internal(
63266326
embeddings->backend = GGML_BACKEND_CPU;
63276327
}
63286328
res->backend = GGML_BACKEND_CPU;
6329+
6330+
ggml_cuda_set_peer_access(n_tokens);
63296331
#endif
63306332

63316333
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

0 commit comments

Comments (0)