CUDA: stream-k decomposition for MMQ (#8018)

JohannesGaessler · web-flow · commit d50f8897a797 · 2024-06-20T14:39:21.000+02:00
* CUDA: stream-k decomposition for MMQ

* fix undefined memory reads for small matrices
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
         }
 
         const int cc = ggml_cuda_info().devices[id].cc;
-        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
     }
     return row_rounding;
 }
diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh
@@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {
 }
 
 // Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc, const int mmq_x) {
-    return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
+static int get_mmq_y_host(const int cc) {
+    return cc >= CC_VOLTA ? 128 : 64;
 }
 
 //////////////////////
diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu
@@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_0:
-            mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_1:
-            mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
             break;
         case GGML_TYPE_Q8_0:
-            mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
             break;
         case GGML_TYPE_Q2_K:
-            mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q3_K:
-            mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q4_K:
-            mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q5_K:
-            mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
             break;
         case GGML_TYPE_Q6_K:
-            mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
+            mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
             break;
         default:
             GGML_ASSERT(false);
diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh

Original file line number	Diff line number	Diff line change
`@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &`
`635`	`635`	`}`
`636`	`636`
`637`	`637`	`const int cc = ggml_cuda_info().devices[id].cc;`
`638`		`-        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));`
	`638`	`+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));`
`639`	`639`	`}`
`640`	`640`	`return row_rounding;`
`641`	`641`	`}`
Original file line number	Diff line number	Diff line change
`@@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {`
`652`	`652`	`}`
`653`	`653`
`654`	`654`	`// Round rows to this value for --split-mode row:`
`655`		`-static int get_mmq_y_host(const int cc, const int mmq_x) {`
`656`		`-    return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;`
	`655`	`+static int get_mmq_y_host(const int cc) {`
	`656`	`+    return cc >= CC_VOLTA ? 128 : 64;`
`657`	`657`	`}`
`658`	`658`
`659`	`659`	`//////////////////////`