Commit 7a6c70f

Bodhi Hu authored and committed
MUSA: support ARM64 and enable dp4a etc. (ggml-org#11843)
* MUSA: support ARM64 and enable __dp4a etc.
* fix cross entropy loss op for musa
* update
* add cc info log for musa
* add comment for the MUSA .cc calculation block

--------

Co-authored-by: Bodhi Hu <[email protected]>
1 parent e84c494 commit 7a6c70f

7 files changed (+25, -15 lines)

Makefile

Lines changed: 1 addition & 1 deletion
@@ -847,7 +847,7 @@ ifdef GGML_MUSA
     CXX := $(MUSA_PATH)/bin/clang++
     MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
 
-    MUSAFLAGS = -x musa -mtgpu
+    MUSAFLAGS = -fsigned-char -x musa -mtgpu
     MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
 
 ifdef GGML_CUDA_FORCE_MMQ
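
The new `-fsigned-char` flag pins down the signedness of plain `char`, which is signed by default on x86-64 but unsigned by default on AArch64; that difference is presumably why the flag shows up in a commit that enables ARM64 builds. A minimal illustration of what the flag changes (not part of the commit):

```cpp
#include <cstdio>

int main() {
    char c = (char) 0xFF;    // bit pattern 0xFF
    int  v = c;              // -1 if plain char is signed, 255 if it is unsigned
    std::printf("%d\n", v);  // with -fsigned-char this prints -1 on every host
    return 0;
}
```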

docs/build.md

Lines changed: 8 additions & 0 deletions
@@ -206,6 +206,14 @@ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU
   cmake --build build --config Release
   ```
 
+  For static build:
+
+  ```bash
+  cmake -B build -DGGML_MUSA=ON \
+      -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+  cmake --build build --config Release
+  ```
+
   The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
 
   The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.

ggml/src/ggml-cuda/common.cuh

Lines changed: 3 additions & 3 deletions
@@ -411,13 +411,13 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 
 #else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 
-#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+#if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
-#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
-#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A
+#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
 
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 }
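
For reference, `__dp4a(a, b, c)` treats each 32-bit operand as four packed 8-bit integers and returns `c` plus their dot product, which is exactly what the portable fallback above computes byte by byte. A host-side sketch of the same arithmetic, with made-up values purely for illustration:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Host-side reference of the fallback path above: interpret each 32-bit
// operand as four packed int8 values and accumulate their dot product into c.
static int dp4a_ref(int a, int b, int c) {
    const int8_t * a8 = (const int8_t *) &a;
    const int8_t * b8 = (const int8_t *) &b;
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    const int8_t av[4] = {1, 2, 3, 4};
    const int8_t bv[4] = {5, 6, 7, 8};
    int a, b;
    std::memcpy(&a, av, sizeof(a));
    std::memcpy(&b, bv, sizeof(b));
    std::printf("%d\n", dp4a_ref(a, b, 10)); // 1*5 + 2*6 + 3*7 + 4*8 + 10 = 80
    return 0;
}
```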

ggml/src/ggml-cuda/cross-entropy-loss.cu

Lines changed: 5 additions & 5 deletions
@@ -123,13 +123,13 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_pool_alloc<float> dst_tmp(pool, blocks_num.x);
 
     if (nbytes_shared <= smpbo) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
-            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
+            CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         cross_entropy_loss_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
     } else {
         cross_entropy_loss_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
@@ -175,13 +175,13 @@ void ggml_cuda_cross_entropy_loss_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
 
     if (nbytes_shared <= smpbo) {
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         static bool shared_memory_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
         if (!shared_memory_limit_raised[id]) {
             CUDA_CHECK(cudaFuncSetAttribute(cross_entropy_loss_back_f32<true>, cudaFuncAttributeMaxDynamicSharedMemorySize, smpbo));
             shared_memory_limit_raised[id] = true;
         }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
         cross_entropy_loss_back_f32<true><<<blocks_num, blocks_dim, nbytes_shared, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
     } else {
         cross_entropy_loss_back_f32<false><<<blocks_num, blocks_dim, 0, stream>>>(grad_d, src0f_d, src1f_d, dst_d, ne00);
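
Besides adding the MUSA guard, the first hunk also fixes a kernel mix-up: the forward path was raising the dynamic shared-memory limit for `cross_entropy_loss_back_f32` while launching `cross_entropy_loss_f32`. The guarded block is the standard CUDA opt-in for using more dynamic shared memory than the default per-block limit; a stripped-down sketch of that pattern, with a hypothetical kernel name `my_kernel` (not from the repo):

```cpp
#include <cuda_runtime.h>

// Hypothetical kernel, used only to illustrate the opt-in pattern above.
__global__ void my_kernel(const float * in, float * out, int n) {
    extern __shared__ float buf[];
    // ... kernel body elided ...
}

// Kernels that need more dynamic shared memory than the default per-block
// limit must raise the limit once via cudaFuncSetAttribute before launching.
static void launch_with_large_smem(const float * in, float * out, int n,
                                   size_t nbytes_shared, cudaStream_t stream) {
    static bool limit_raised = false;  // ggml tracks this per device and per kernel
    if (!limit_raised) {
        cudaFuncSetAttribute(my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                             (int) nbytes_shared);
        limit_raised = true;
    }
    my_kernel<<<1, 256, nbytes_shared, stream>>>(in, out, n);
}
```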

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 6 additions & 4 deletions
@@ -261,6 +261,12 @@ static ggml_cuda_device_info ggml_cuda_init() {
         GGML_LOG_INFO("  Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
                       id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
                       device_vmm ? "yes" : "no", prop.warpSize);
+#elif defined(GGML_USE_MUSA)
+        // TODO: refine the .cc to reflect MUSA's actual CC capabilities
+        info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
+        info.devices[id].cc = 100*prop.major + 10*prop.minor;
+        GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
+                      id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -1782,9 +1788,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
         }
     }
 #else
-#ifdef GGML_USE_MUSA
-    GGML_ASSERT(false);
-#else // !GGML_USE_MUSA
     if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
@@ -1827,7 +1830,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             cu_compute_type,
             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     }
-#endif // GGML_USE_MUSA
 #endif
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
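
The new MUSA branch reuses the CUDA convention of packing the compute capability into a single integer, `cc = 100*major + 10*minor`; per the TODO comment this is only an approximation of what the MUSA hardware actually supports. A trivial illustration with hypothetical values (not taken from any real MTT device):

```cpp
#include <cstdio>

int main() {
    // Hypothetical values as they would come from the device properties struct.
    int major = 2;
    int minor = 1;
    int cc    = 100*major + 10*minor;
    std::printf("compute capability %d.%d -> cc %d\n", major, minor, cc); // cc 210
    return 0;
}
```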

ggml/src/ggml-impl.h

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE
 
-#if defined(__ARM_NEON) && !defined(__CUDACC__)
+#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/

ggml/src/ggml-musa/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ if (MUSAToolkit_FOUND)
 
     set_source_files_properties(${GGML_SOURCES_MUSA} PROPERTIES LANGUAGE CXX)
     foreach(SOURCE ${GGML_SOURCES_MUSA})
-        set(COMPILE_FLAGS "-x musa -mtgpu")
+        set(COMPILE_FLAGS "-fsigned-char -x musa -mtgpu")
         foreach(ARCH ${MUSA_ARCHITECTURES})
            set(COMPILE_FLAGS "${COMPILE_FLAGS} --cuda-gpu-arch=mp_${ARCH}")
         endforeach()
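
This mirrors the Makefile change above for the CMake build path. For a hypothetical `MUSA_ARCHITECTURES` value of `21;22`, the resulting per-source flags would be `-fsigned-char -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22`.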
