
Commit 6a72c7f

Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int

2 parents: a74b1a8 + 1d7a191

2 files changed: +94 −26


ggml-cuda.cu

Lines changed: 93 additions & 25 deletions
@@ -7830,6 +7830,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     }
 
 #ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
 
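Note: the added loop runs only when NDEBUG is defined (release builds) and synchronizes every device before the peer-access mappings are toggled, presumably so that no kernel is still touching peer memory while the mapping changes. For context, a minimal sketch of how peer access is typically toggled with the CUDA runtime API — illustrative only, not the actual body of ggml_cuda_set_peer_access:

    // Sketch: enable or disable direct peer access between all device pairs.
    // This is NOT ggml's implementation, just the standard CUDA runtime pattern;
    // return codes such as cudaErrorPeerAccessAlreadyEnabled are ignored here.
    static void set_peer_access_all(int n_devices, bool enable) {
        for (int id = 0; id < n_devices; ++id) {
            cudaSetDevice(id);
            for (int other = 0; other < n_devices; ++other) {
                if (other == id) {
                    continue;
                }
                int can_access = 0;
                cudaDeviceCanAccessPeer(&can_access, id, other);
                if (!can_access) {
                    continue;
                }
                if (enable) {
                    cudaDeviceEnablePeerAccess(other, 0); // flags must be 0
                } else {
                    cudaDeviceDisablePeerAccess(other);
                }
            }
        }
    }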

@@ -7881,8 +7886,6 @@ static void ggml_cuda_op_mul_mat(
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
-    ggml_cuda_set_peer_access(ne11);
-
     GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
 
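The peer-access call is moved rather than dropped: see the ggml_cuda_compute_forward hunk at line 9433 below, where it is re-added behind a GGML_BACKEND_GPU_SPLIT check.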

@@ -8781,16 +8784,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 
     GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
 
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
+
     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
     const int32_t n_as = ((int32_t *) dst->op_params)[1];
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
+    const cudaStream_t stream = g_cudaStreams[g_main_device][0];
+
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+        CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
+        CUDA_CHECK(cudaStreamSynchronize(stream));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
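The expert-id tensor is copied to the host (with a stream synchronize) because the routing loop below runs on the CPU; this hunk otherwise just hoists nb11/nb1 and the repeated g_cudaStreams[g_main_device][0] lookup into locals used by the new code.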
@@ -8804,37 +8812,93 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.ne[1] = 1;
-    dst_row.ne[1] = 1;
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
 
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
+    char * src1_original = (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  = (char *)  dst_extra->data_device[g_main_device];
 
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
+    if (src1->ne[1] == 1) {
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
 
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        //int32_t row_id;
-        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
 
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
 
-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        size_t as_src1, as_dst;
+        char * src1_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(src1), &as_src1);
+        char *  dst_contiguous = (char *) ggml_cuda_pool_malloc(sizeof(float)*ggml_nelements(dst),  &as_dst);
 
-        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
-        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+        src1_row_extra.data_device[g_main_device] = src1_contiguous;
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
 
-        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
-        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+                CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
+                                        nb11, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
+                                        nb1, cudaMemcpyDeviceToDevice, stream));
+                num_src1_rows++;
+            }
+        }
+
+        ggml_cuda_pool_free(src1_contiguous, as_src1);
+        ggml_cuda_pool_free(dst_contiguous, as_dst);
     }
 }
 

@@ -9027,7 +9091,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
-    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
 
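With the added !tensor->extra check, freeing becomes a no-op for tensors tagged with a GPU backend that never received device-side extra data, which would otherwise be dereferenced later in the function.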

@@ -9369,6 +9433,10 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         return false;
     }
 
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
     if (params->ith != 0) {
         return true;
     }
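This is the new home of the ggml_cuda_set_peer_access(ne11) call removed from ggml_cuda_op_mul_mat above: it now runs per dispatched op, only when src[0] is a split tensor, with src[1]->ne[1] (the number of src1 columns) as the batch size.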

ggml.h

Lines changed: 1 addition & 1 deletion

@@ -303,7 +303,7 @@ extern "C" {
 
 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
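The tightened guard keeps MSVC out of the __fp16 branch: when a compiler defines both __ARM_NEON and _MSC_VER, the __fp16 scalar type is not available, so ggml_fp16_t falls back to the plain uint16_t bit-pattern representation.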
