@@ -597,11 +597,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "                        number of layers to store in VRAM\n");
     fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n");
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n");
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+#ifdef GGML_USE_CUBLAS
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
     fprintf(stdout, "  --mtest               compute maximum memory usage\n");
     fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");