Commit edc1630

Author: Bodhi Hu
Commit message: update
1 parent 8a68656 · commit edc1630

File tree: 4 files changed, +5 −10 lines


docs/build.md

Lines changed: 2 additions & 6 deletions

````diff
@@ -202,17 +202,14 @@ This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU
 - Using `CMake`:
 
   ```bash
-  # build with MUSA and using the compilers from MUSA SDK:
-  cmake -B build -DGGML_MUSA=ON \
-      -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
+  cmake -B build -DGGML_MUSA=ON
   cmake --build build --config Release
   ```
 - For static build:
 
   ```bash
   cmake -B build -DGGML_MUSA=ON \
-      -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-      -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++
+      -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
   cmake --build build --config Release
   ```
 
@@ -222,7 +219,6 @@ The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory
 
 Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
 
-
 ## HIP
 
 This provides GPU acceleration on HIP-supported AMD GPUs.
````

ggml/src/ggml-cuda/common.cuh

Lines changed: 1 addition & 1 deletion

```diff
@@ -404,7 +404,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
 
 #if __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     return __dp4a(a, b, c);
-#else
+#else // __CUDA_ARCH__ >= GGML_CUDA_CC_DP4A || defined(GGML_USE_MUSA)
     const int8_t * a8 = (const int8_t *) &a;
     const int8_t * b8 = (const int8_t *) &b;
     return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
```
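For context, the `#else` branch annotated above is the scalar fallback for the `__dp4a` intrinsic: each 32-bit int is treated as four packed int8 lanes whose pairwise products are accumulated into `c`. A minimal host-side sketch of that arithmetic (the `dp4a_ref` name, the `memcpy`-based lane extraction, and the sample values in `main` are illustrative, not part of the commit):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Reference version of the fallback branch: unpack each 32-bit int into
// four int8 lanes and accumulate their products into c, which is what the
// __dp4a intrinsic computes in hardware on supported GPUs.
static int dp4a_ref(int a, int b, int c) {
    int8_t a8[4], b8[4];
    std::memcpy(a8, &a, sizeof a8); // sidesteps the reinterpret-cast aliasing used in the kernel
    std::memcpy(b8, &b, sizeof b8);
    return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
}

int main() {
    // On a little-endian target 0x01020304 packs the int8 lanes {4, 3, 2, 1}.
    printf("%d\n", dp4a_ref(0x01020304, 0x01010101, 10)); // 10 + 4+3+2+1 = 20
    return 0;
}
```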

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 1 addition & 2 deletions

```diff
@@ -262,8 +262,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
             id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
             device_vmm ? "yes" : "no", prop.warpSize);
 #elif defined(GGML_USE_MUSA)
-        // NOTE: MUSA will reserve some shared mem, and 24B should be enough,
-        // we can remove the **24** when MUSA no longer reserves shared mem.
+        // NOTE: MUSA will reserve some shared mem, and 24B should be enough
        info.devices[id].smpbo = prop.sharedMemPerBlockOptin - 24;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
 #else
```
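For context, `smpbo` is the opt-in shared-memory-per-block limit, which this code reduces by 24 bytes because the MUSA runtime reserves some shared memory per block. A standalone sketch of the same query, written against the CUDA runtime API that ggml's MUSA build redirects to the `musa*` equivalents (the `main` wrapper and device index 0 are illustrative, not from the commit):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Sketch of the workaround the trimmed comment describes: report 24 bytes
// less than the opt-in shared-memory maximum, since the MUSA runtime
// reserves a small amount of shared memory per block.
int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, /*device=*/0) != cudaSuccess) {
        return 1;
    }
    const size_t smpbo = prop.sharedMemPerBlockOptin - 24; // MUSA reservation workaround
    printf("usable shared mem per block (opt-in): %zu bytes\n", smpbo);
    return 0;
}
```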

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion

```diff
@@ -3703,7 +3703,7 @@ void llama_model::print_info() const {
     }
 
     if (arch == LLM_ARCH_LLAMA) {
-        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
     }
 
     vocab.print_info();
```
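The change to this log call is whitespace-only. For reference, a minimal sketch of what the `%.1f` format in that line prints, with `printf` standing in for `LLAMA_LOG_INFO` (the 2.5 value and the standalone `main` are illustrative, not from the commit):

```cpp
#include <cstdio>

// Illustrative stand-in for the LLAMA_LOG_INFO call in print_info():
// %.1f renders the MoE expert weight scale with one decimal place.
int main() {
    const float expert_weights_scale = 2.5f; // hypothetical value, for illustration only
    printf("%s: expert_weights_scale = %.1f\n", "print_info", expert_weights_scale);
    return 0;
}
```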
