Skip to content

Commit 9b169a4

Browse files
authored
vulkan: fix mul_mat_vec failure in backend tests (#12529)
The out-of-bounds (OOB) calculation could be wrong if the last iteration fell within one of the unrolled loops. Adjust the unrolling counts to avoid this. Add a couple of new backend tests that hit this failure on NVIDIA GPUs.
1 parent 77f9c6b commit 9b169a4

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,16 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
105105
int unroll_count = 4;
106106
uint unrolled_iters = num_iters & ~(unroll_count - 1);
107107

108+
#if K_PER_ITER == 2
109+
// If the K dimension is odd, we need lastiter==true on the last iteration
110+
// so OOB is computed correctly. Skip some unrolling to make that happen.
111+
if ((p.ncols & 1) != 0 &&
112+
unrolled_iters == num_iters &&
113+
unrolled_iters > 0) {
114+
unrolled_iters -= unroll_count;
115+
}
116+
#endif
117+
108118
uint i = 0;
109119
while (i < unrolled_iters) {
110120
// Manually partially unroll the loop
@@ -113,8 +123,18 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
113123
i++;
114124
}
115125
}
126+
116127
unroll_count = 2;
117128
unrolled_iters = num_iters & ~(unroll_count - 1);
129+
130+
#if K_PER_ITER == 2
131+
if ((p.ncols & 1) != 0 &&
132+
unrolled_iters == num_iters &&
133+
unrolled_iters > 0) {
134+
unrolled_iters -= unroll_count;
135+
}
136+
#endif
137+
118138
while (i < unrolled_iters) {
119139
// Manually partially unroll the loop
120140
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {

tests/test-backend-ops.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4204,6 +4204,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
42044204
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
42054205
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
42064206
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
4207+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
4208+
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
42074209

42084210
for (auto bs : {1,2,4,8}) {
42094211
for (auto nr : {1,4}) {

0 commit comments

Comments
 (0)