-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Open
Description
Hello, Mr. @ggerganov , thank you for awesome project.
ggml_cuda_op_mul_mat() {
...
// copy dst to host or other device if necessary
if (!dst_on_device) {
void * dst_off_device;
cudaMemcpyKind kind;
if (dst->backend == GGML_BACKEND_CPU) {
dst_off_device = dst->data;
kind = cudaMemcpyDeviceToHost;
} else if (dst->backend == GGML_BACKEND_GPU) {
dst_off_device = dst_extra->data_device[g_main_device];
kind = cudaMemcpyDeviceToDevice;
} else {
GGML_ASSERT(false);
}
if (split) {
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
// dst is NOT transposed.
// The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
// Instead they need to be copied to the correct slice in ne0 = dst row index.
// If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + row_low[id];
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), dst_dd_i, row_diff*sizeof(float),
row_diff*sizeof(float), src1_ncols, kind, stream));
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0;
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
}
}
// add event for the main device to wait on until other device is done
if (split && (id != g_main_device || is != 0)) {
CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream));
}
}
...
}
I'm trying to understand how this code should work.
What I clarify atm, is:
- when split is true, than call to cudaMemcpy2DAsync() resulting in SIGSEGV, because of nullptr deref deeply in libhipamd64.so.
- the total src size (WxH) would be half from
src1_ncols*ne0*sizeof(float), when running on two GPUs. - the dst pointer, mem, memsize, and mem data (checked under rocgdb in runtime) seems to be ok.
- the src mem + (WxH - 1) points to last usable byte of weights matrix, which seems to be ok too.
where WxH isrow_diff*sizeof(float)*src1_ncols - if change 2D to classic async memcpy -- the crash is gone, but the results is wrong.
What I asking to hint, or explain me, please:
- How and why dpitch and spitch should be formed
- why src pitch is so large?
- why src height is so small?
- if 2 and 3 perform un-transposing of result?
- why does it crash?
- WxH is okey, but WxH*pitch is oob?
Thank you 💯
[in] | width | Width of matrix transfer (columns in bytes)
[in] | height | Height of matrix transfer (rows)
Metadata
Metadata
Assignees
Labels
No labels