Commit 5dbc2b3

Enable build with CUDA 11.0 (make) (#3132)

* CUDA 11.0 fixes
* Cleaner CUDA/host flags separation; also renamed GGML_ASSUME to GGML_CUDA_ASSUME

1 parent b08e75b · commit 5dbc2b3

File tree

2 files changed: +78, -62 lines

* Makefile
* ggml-cuda.cu

Makefile

Lines changed: 29 additions & 19 deletions

@@ -95,16 +95,19 @@ CXXV := $(shell $(CXX) --version | head -n 1)
 #

 # keep standard at C11 and C++11
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS   = -std=c11   -fPIC
+MK_CXXFLAGS = -std=c++11 -fPIC
+
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-OPT = -Ofast
+MK_CFLAGS        += -Ofast
+MK_HOST_CXXFLAGS += -Ofast
+MK_CUDA_CXXFLAGS += -O3
 else
-OPT = -O3
+MK_CFLAGS   += -O3
+MK_CXXFLAGS += -O3
 endif
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS = $(OPT) -std=c11 -fPIC
-MK_CXXFLAGS = $(OPT) -std=c++11 -fPIC
-MK_LDFLAGS =

 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -232,7 +235,7 @@ ifndef RISCV
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
     # Use all CPU extensions that are available:
     MK_CFLAGS += -march=native -mtune=native
-    MK_CXXFLAGS += -march=native -mtune=native
+    MK_HOST_CXXFLAGS += -march=native -mtune=native

     # Usage AVX-only
     #MK_CFLAGS += -mfma -mf16c -mavx
@@ -372,7 +375,7 @@ ifdef LLAMA_CUDA_CCBIN
    NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-   $(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
+   $(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
@@ -440,23 +443,30 @@ k_quants.o: k_quants.c k_quants.h
 endif # LLAMA_NO_K_QUANTS

 # combine build flags with cmdline overrides
-override CFLAGS   := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
-override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override LDFLAGS  := $(MK_LDFLAGS) $(LDFLAGS)
+override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
+override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
+override LDFLAGS       := $(MK_LDFLAGS) $(LDFLAGS)
+
+# save CXXFLAGS before we add host-only options
+NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CXXFLAGS += $(HOST_CXXFLAGS)

 #
 # Print build information
 #

 $(info I llama.cpp build info: )
-$(info I UNAME_S:  $(UNAME_S))
-$(info I UNAME_P:  $(UNAME_P))
-$(info I UNAME_M:  $(UNAME_M))
-$(info I CFLAGS:   $(CFLAGS))
-$(info I CXXFLAGS: $(CXXFLAGS))
-$(info I LDFLAGS:  $(LDFLAGS))
-$(info I CC:       $(CCV))
-$(info I CXX:      $(CXXV))
+$(info I UNAME_S:   $(UNAME_S))
+$(info I UNAME_P:   $(UNAME_P))
+$(info I UNAME_M:   $(UNAME_M))
+$(info I CFLAGS:    $(CFLAGS))
+$(info I CXXFLAGS:  $(CXXFLAGS))
+$(info I NVCCFLAGS: $(NVCCFLAGS))
+$(info I LDFLAGS:   $(LDFLAGS))
+$(info I CC:        $(CCV))
+$(info I CXX:       $(CXXV))
 $(info )

 #
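
Note on the flag separation above: nvcc rejects host-only options such as -Ofast and -march=native, so those now accumulate in HOST_CXXFLAGS and reach the host compiler through -Xcompiler "$(HOST_CXXFLAGS)", while CUDA_CXXFLAGS (a plain -O3 here) is passed to nvcc directly. This is also why the old $(subst -Ofast,-O3,$(CXXFLAGS)) rewrite in the ggml-cuda.o rule is no longer needed.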

ggml-cuda.cu

Lines changed: 49 additions & 43 deletions

@@ -61,7 +61,7 @@
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #else
@@ -190,6 +190,12 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 } while (0)
 #endif // CUDART_VERSION >= 11

+#if CUDART_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
 #ifdef GGML_CUDA_F16
 typedef half dfloat; // dequantize float
 typedef half2 dfloat2;
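
The version gate exists because nvcc only accepts __builtin_assume in device code starting with CUDA 11.1 (CUDART_VERSION 11100); on older toolkits GGML_CUDA_ASSUME compiles away and the build merely loses an optimization hint. A minimal sketch of the pattern, using a hypothetical kernel that is not part of this commit:

// Hypothetical kernel showing the GGML_CUDA_ASSUME pattern: on CUDA >= 11.1
// the hints let the compiler assume the index stays inside one warp; on
// older toolkits they expand to nothing and the same source still builds.
#include <cuda_runtime.h>

#if CUDART_VERSION >= 11100
#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else
#define GGML_CUDA_ASSUME(x)
#endif

#define WARP_SIZE 32

__global__ void scale_warp(float * dst, const float * src) {
    const int k = threadIdx.x;
    GGML_CUDA_ASSUME(k >= 0);          // launched with one warp, so 0 <= k < 32
    GGML_CUDA_ASSUME(k < WARP_SIZE);
    dst[k] = 2.0f * src[k];
}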
@@ -2145,10 +2151,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_0;
     const int kqsx = k % QI4_0;
@@ -2239,10 +2245,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_1;
     const int kqsx = k % QI4_1;
@@ -2331,10 +2337,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_0;
     const int kqsx = k % QI5_0;
@@ -2445,10 +2451,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_1;
     const int kqsx = k % QI5_1;
@@ -2551,10 +2557,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI8_0;
     const int kqsx = k % QI8_0;
@@ -2642,10 +2648,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;
@@ -2763,10 +2769,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;
@@ -2981,10 +2987,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3162,10 +3168,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3291,10 +3297,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {

-    __builtin_assume(i_offset >= 0);
-    __builtin_assume(i_offset < nwarps);
-    __builtin_assume(k >= 0);
-    __builtin_assume(k < WARP_SIZE);
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k < WARP_SIZE);

     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -6408,7 +6414,7 @@ static void ggml_cuda_op_mul_mat(

         // wait for main GPU data if necessary
         if (split && (id != g_main_device || is != 0)) {
-            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0]));
+            CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0));
         }

         for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -6530,7 +6536,7 @@ static void ggml_cuda_op_mul_mat(
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is]));
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
             }
         }
     }
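
Both call sites now pass the flags argument explicitly; per the commit title, this is what lets the file compile under CUDA 11.0, where cudaStreamWaitEvent evidently has no default for that parameter, and the hipStreamWaitEvent macro near the top of the file forwards the same value. A self-contained sketch of the record-then-wait pattern these calls implement (stream and event names are illustrative, not from the commit):

// Illustrative: make `consumer` start its work only after the event
// recorded on `producer` has fired. The third argument of
// cudaStreamWaitEvent is the flags word; 0 requests the default behavior.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

int main() {
    cudaStream_t producer, consumer;
    cudaEvent_t  ready;
    CUDA_CHECK(cudaStreamCreateWithFlags(&producer, cudaStreamNonBlocking));
    CUDA_CHECK(cudaStreamCreateWithFlags(&consumer, cudaStreamNonBlocking));
    CUDA_CHECK(cudaEventCreateWithFlags(&ready, cudaEventDisableTiming));

    // ... enqueue kernels or async copies on `producer` here ...
    CUDA_CHECK(cudaEventRecord(ready, producer));

    // Explicit flags argument, as in the two hunks above.
    CUDA_CHECK(cudaStreamWaitEvent(consumer, ready, 0));
    // ... anything enqueued on `consumer` from here on waits for `ready` ...

    CUDA_CHECK(cudaStreamSynchronize(consumer));
    CUDA_CHECK(cudaEventDestroy(ready));
    CUDA_CHECK(cudaStreamDestroy(producer));
    CUDA_CHECK(cudaStreamDestroy(consumer));
    return 0;
}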
