@@ -5891,6 +5891,325 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     }
 }
 
+static void ggml_cuda_op_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                 ggml_cuda_op_t op, bool src0_needs_f32) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t ne10 = use_src1 ? src1->ne[0] : 1;
+    const int64_t ne11 = use_src1 ? src1->ne[1] : 1;
+    const int64_t ne12 = use_src1 ? src1->ne[2] : 1;
+    const int64_t ne13 = use_src1 ? src1->ne[3] : 1;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+    // strides for iteration over dims 3 and 2
+    const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13;
+    const int64_t num_iters = num_iters_0;
+    const int64_t stride_mod = 1;
+    const int64_t src0_stride = ne00 * ne01 * stride_mod;
+    const int64_t src1_stride = ne10 * ne11 * stride_mod;
+    const int64_t dst_stride = ne0 * ne1 * stride_mod;
+
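+    // if src1 has more dim-2 slices than src0 (ne12 > ne02), each src0 slice
+    // is reused for i02_divisor consecutive slices of src1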
+    const int64_t rows_per_iter = ne01;
+    const int64_t i03_max = ne03;
+    const int64_t i02_max = (ne02 >= ne12 ? ne02 : ne12);
+    const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+
+    struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src0_is_f32 = src0->type == GGML_TYPE_F32;
+
+    const bool src1_is_contiguous = use_src1 && ggml_is_contiguous(src1);
+    const bool src1_stays_on_host = use_src1 && (
+        dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+
+    // dd = data device
+    char  * src0_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // quantized
+    float * src0_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float *  dst_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // asq = actual size quantized, asf = actual size float
+    size_t src0_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src0_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t  dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        if (!split && id != g_main_device) {
+            continue;
+        }
+
+        const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
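+        // for split tensors each device handles a contiguous slice [row_low, row_high)
+        // of src0's rows, chosen from g_tensor_split and aligned to the type's row rounding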
+        int64_t row_low, row_high;
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
+            row_low = id == 0 ? 0 : nrows0*g_tensor_split[id];
+            row_low -= row_low % rounding;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows0;
+            } else {
+                row_high = nrows0*g_tensor_split[id + 1];
+                row_high -= row_high % rounding;
+            }
+        } else {
+            row_low = 0;
+            row_high = nrows0*i02_divisor;
+        }
+        if (row_low == row_high) {
+            continue;
+        }
+
+        int64_t row_diff = row_high - row_low;
+
+        cudaSetDevice(id);
+        cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+        // wait for main GPU data if necessary
+        if (split && id != g_main_device) {
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+        }
+
+        if (src0_on_device && src0_is_contiguous) {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) src0_extra->data_device[id];
+            } else {
+                src0_ddq[id] = (char *) src0_extra->data_device[id];
+            }
+        } else {
+            if (src0_is_f32) {
+                src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+            } else {
+                src0_ddq[id] = (char *) ggml_cuda_pool_malloc(row_diff*ne00 * src0_ts/src0_bs, &src0_asq[id]);
+            }
+        }
+
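+        // if the op needs f32 input but src0 is quantized/f16, also allocate a buffer
+        // for the dequantized data (filled by to_fp32_cuda further below)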
+        if (src0_needs_f32 && !src0_is_f32) {
+            src0_ddf[id] = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_asf[id]);
+        }
+
+        if (use_src1 && !src1_stays_on_host) {
+            if (src1_on_device && src1_is_contiguous) {
+                src1_ddf[id] = (float *) src1_extra->data_device[id];
+            } else {
+                src1_ddf[id] = (float *) ggml_cuda_pool_malloc(num_iters*src1_stride * sizeof(float), &src1_asf[id]);
+            }
+        }
+        if (dst_on_device) {
+            dst_ddf[id] = (float *) dst_extra->data_device[id];
+        } else {
+            size_t size_dst_ddf = split ? row_diff*ne1 * sizeof(float) : num_iters*dst_stride * sizeof(float);
+            dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]);
+        }
+
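+        // iterate over the slices in dims 3 and 2; i12 wraps around so that src1 is
+        // broadcast when it has fewer dim-2 slices than src0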
+        for (int64_t i03 = 0; i03 < i03_max; i03++) {
+            const int64_t i13 = i03 % ne13;
+            for (int64_t i02 = 0; i02 < i02_max; i02++) {
+                const int64_t i12 = i02 % ne12;
+
+                const int64_t i0 = i03*i02_max + i02;
+
+                // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs
+                const int64_t i0_offset_low = row_low/rows_per_iter;
+                const int64_t i0_offset_high = row_high/rows_per_iter;
+
+                int64_t i01_low = 0;
+                int64_t i01_high = rows_per_iter;
+                if (split) {
+                    if (i0 < i0_offset_low || i0 > i0_offset_high) {
+                        continue;
+                    }
+                    if (i0 == i0_offset_low) {
+                        i01_low = row_low % rows_per_iter;
+                    }
+                    if (i0 == i0_offset_high) {
+                        i01_high = row_high % rows_per_iter;
+                    }
+                }
+
+                // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
+                // Removing the first assert or changing the order of the arguments causes the second assert to fail.
+                // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
+                // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
+                GGML_ASSERT(i01_low == 0 || g_device_count > 1);
+                GGML_ASSERT(i01_high == rows_per_iter || g_device_count > 1);
+
+                const int64_t i01_diff = i01_high - i01_low;
+                if (i01_diff == 0) {
+                    continue;
+                }
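+                // flattened index of the (i13, i12) slice, used to offset into the
+                // contiguous src1 buffer and passed on to the op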
+                const int64_t i11 = i13*ne12 + i12;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char  * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs;
+                float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride;
+                float * src1_ddf_i = src1_ddf[id] + i11*src1_stride;
+                float * dst_ddf_i  =  dst_ddf[id] + (i0 - i0_offset_low)*dst_stride;
+
+                // for split tensors the data pointer needs to be rounded down
+                // to the bin edge for i03, i02 bins beyond the first
+                if (i0 - i0_offset_low > 0) {
+                    src0_ddq_i -= (row_low % ne01)*ne00 * src0_ts/src0_bs;
+                    src0_ddf_i -= (row_low % ne01)*ne00;
+                    dst_ddf_i  -= (row_low % ne0)*ne1;
+                }
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_ddf_i += i01_low; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (use_src1 && !src1_stays_on_host) {
+                    if (src1->backend == GGML_BACKEND_CPU) {
+                        int64_t nrows1 = ne11;
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
+                    } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                        if (id != g_main_device) {
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            src1_ddf_i_source += i11*src1_stride;
+                            CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
+                                                       cudaMemcpyDeviceToDevice, cudaStream_main));
+                        }
+                    } else if (src1_on_device && !src1_is_contiguous) {
+                        GGML_ASSERT(!split);
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, ne11, cudaStream_main));
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                }
+
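+                // when src0 is broadcast over dim 2, a new src0 slice only needs to be
+                // copied to the device once every i02_divisor iterations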
+                if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    if (src0_is_f32) {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    } else {
+                        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main));
+                    }
+                }
+
+                // convert src0 to f32 if it is necessary for the ggml_cuda_op
+                if (src0_needs_f32 && !src0_is_f32) {
+                    to_fp32_cuda(src0_ddq_i, src0_ddf_i, i01_diff*ne00, cudaStream_main);
+                    CUDA_CHECK(cudaGetLastError());
+                }
+
+                // do the computation
+                op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+                CUDA_CHECK(cudaGetLastError());
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device;
+                    cudaMemcpyKind kind;
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        dst_off_device = dst->data;
+                        kind = cudaMemcpyDeviceToHost;
+                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                        dst_off_device = dst_extra->data_device[g_main_device];
+                        kind = cudaMemcpyDeviceToDevice;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i01_low*sizeof(float) + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_ddf_i, i01_diff*sizeof(float),
+                                                     i01_diff*sizeof(float), ne1, kind, cudaStream_main));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
+                    }
+                }
+
+                // signify to main device that other device is done
+                if (split && g_device_count > 1 && id != g_main_device) {
+                    CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+                }
+            }
+        }
+    }
+
+    // wait until each device is finished, then free their buffers
+    for (int id = 0; id < g_device_count; ++id) {
+        if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
+            continue;
+        }
+
+        CUDA_CHECK(cudaSetDevice(id));
+
+        if (src0_asq[id] > 0) {
+            ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
+        }
+        if (src0_asf[id] > 0) {
+            ggml_cuda_pool_free(src0_ddf[id], src0_asf[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (dst_asf[id] > 0) {
+            ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device && src0_extra->events[id]) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+}
+
 static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
                          ggml_cuda_op_t op, bool src0_needs_f32, bool flatten_rows) {
     const int64_t ne00 = src0->ne[0];
@@ -6327,10 +6646,10 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
+            ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false);
         } else {
             int min_compute_capability = INT_MAX;
             for (int id = 0; id < g_device_count; ++id) {
@@ -6341,9 +6660,9 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
             }
 
             if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, false);
             } else {
-                ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true);
             }
         }
     } else {