
Commit 52fcea9

slaren authored and hodlen committed

cuda : add LLAMA_CUDA_NO_PEER_COPY to workaround broken ROCm p2p copy (ggml-org#6208)

* cuda : add LLAMA_CUDA_NO_PEER_COPY to workaround broken ROCm p2p copy
* add LLAMA_CUDA_NO_PEER_COPY to HIP build

1 parent 38a0cb6 commit 52fcea9

File tree: 3 files changed, +31 −8 lines changed

* CMakeLists.txt
* Makefile
* ggml-cuda.cu

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
+option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)

@@ -387,6 +388,9 @@ if (LLAMA_CUBLAS)
     endif()
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+    if (LLAMA_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()

     if (LLAMA_STATIC)
         if (WIN32)

@@ -531,6 +535,10 @@ if (LLAMA_HIPBLAS)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()

+    if (LLAMA_CUDA_NO_PEER_COPY)
+        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
+    endif()
+
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
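
With the option defined, the workaround can be toggled at configure time. A possible invocation for a ROCm build (the command line itself is illustrative; LLAMA_HIPBLAS is the existing HIP option shown above):

    cmake -B build -DLLAMA_HIPBLAS=ON -DLLAMA_CUDA_NO_PEER_COPY=ON
    cmake --build build

Both the LLAMA_CUBLAS and LLAMA_HIPBLAS branches map the option onto the same GGML_CUDA_NO_PEER_COPY compile definition, so ggml-cuda.cu only has to check a single macro.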

Makefile

Lines changed: 6 additions & 3 deletions
@@ -452,9 +452,9 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 else
 	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-#ifdef LLAMA_CUDA_CUBLAS
-#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
-#endif # LLAMA_CUDA_CUBLAS
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif

@@ -535,6 +535,9 @@ endif # LLAMA_HIP_UMA
 ifdef LLAMA_CUDA_FORCE_DMMV
 	HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
+ifdef LLAMA_CUDA_NO_PEER_COPY
+	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # LLAMA_CUDA_NO_PEER_COPY
 OBJS += ggml-cuda.o
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
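
The Makefile gains the same switch as a command-line variable, replacing a dead, commented-out LLAMA_CUDA_CUBLAS block in the NVCC section. A possible invocation for a ROCm build (illustrative; LLAMA_HIPBLAS=1 is the Makefile's existing HIP toggle):

    make LLAMA_HIPBLAS=1 LLAMA_CUDA_NO_PEER_COPY=1

As in the CMake build, the flag becomes -DGGML_CUDA_NO_PEER_COPY, passed to nvcc via MK_NVCCFLAGS or to hipcc via HIPFLAGS.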

ggml-cuda.cu

Lines changed: 17 additions & 5 deletions
@@ -771,7 +771,11 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t
     if (src_ctx->device == dst_ctx->device) {
         CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread));
     } else {
+#ifdef GGML_CUDA_NO_PEER_COPY
+        return false;
+#else
         CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread));
+#endif
     }
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
     return true;

@@ -11322,19 +11326,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_
     GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device);
     GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device);

-    if (!cuda_ctx_src->copy_event) {
-        ggml_cuda_set_device(cuda_ctx_src->device);
-        CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
-    }
-
     // copy on src stream
     if (cuda_ctx_src->device == cuda_ctx_dst->device) {
         CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream()));
     } else {
+#ifdef GGML_CUDA_NO_PEER_COPY
+        return false;
+#else
         CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
+#endif
     }

     // record event on src stream
+    if (!cuda_ctx_src->copy_event) {
+        ggml_cuda_set_device(cuda_ctx_src->device);
+        CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming));
+    }
+
     CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream()));

     // wait on dst stream for the copy to complete

@@ -11530,6 +11538,9 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
 }

 static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
+#ifdef GGML_CUDA_NO_PEER_COPY
+    return nullptr;
+#else
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

     ggml_cuda_set_device(cuda_ctx->device);

@@ -11541,6 +11552,7 @@ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend)
         /* .backend = */ backend,
         /* .context = */ event,
     };
+#endif
 }

 static void ggml_backend_cuda_event_free(ggml_backend_event_t event) {
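
Taken together, these changes make every cross-device path opt out when GGML_CUDA_NO_PEER_COPY is defined: ggml_backend_cuda_buffer_cpy_tensor and ggml_backend_cuda_cpy_tensor_async return false instead of calling cudaMemcpyPeerAsync (the event creation in the async path also moves below the early return, so no event is created for a copy that never happens), and ggml_backend_cuda_event_new returns nullptr so the scheduler does not attempt event-based synchronization. When the backend refuses a copy, the generic ggml-backend layer falls back to staging the transfer through host memory. A minimal sketch of such a staged copy in plain CUDA runtime calls (copy_through_host is a hypothetical helper for illustration, not code from this commit):

    #include <cuda_runtime.h>
    #include <stdlib.h>

    // Hypothetical fallback: move bytes between two GPUs through a host
    // staging buffer instead of a direct peer-to-peer (P2P) transfer.
    static void copy_through_host(void * dst, int dst_device,
                                  const void * src, int src_device, size_t nbytes) {
        void * staging = malloc(nbytes);                          // temporary host buffer

        cudaSetDevice(src_device);                                // read from the source GPU
        cudaMemcpy(staging, src, nbytes, cudaMemcpyDeviceToHost);

        cudaSetDevice(dst_device);                                // write to the destination GPU
        cudaMemcpy(dst, staging, nbytes, cudaMemcpyHostToDevice);

        free(staging);
    }

The staged copy costs an extra host round-trip per transfer, but it sidesteps the broken ROCm peer-to-peer path entirely, which is the point of the workaround.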
