Skip to content

Commit a124a4f

Browse files
author
s50048389
committed
Stuff for testing
1 parent 5399d8a commit a124a4f

File tree

2 files changed

+93
-11
lines changed

2 files changed

+93
-11
lines changed

ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ void main() {
315315
#else
316316
ACC_TYPE_VEC2 sums[WMITER * TM * WNITER * TN/2];
317317
FLOAT_TYPE_VEC2 cache_a[WMITER * TM];
318-
FLOAT_TYPE_VEC2 cache_b[TN];
318+
FLOAT_TYPE_VEC2 cache_b;
319319

320320
[[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN/2; i++) {
321321
sums[i] = ACC_TYPE_VEC2(0.0f, 0.0f);
@@ -360,21 +360,40 @@ void main() {
360360
cache_a[wsir * TM + j] = buf_a[(warp_r * WM + wsir * WSUBM + tiwr * TM + j) * SHMEM_STRIDE + i];
361361
}
362362
}
363-
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
364-
[[unroll]] for (uint j = 0; j < TN; j++) {
365-
cache_b[j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
366-
}
363+
// [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
364+
// [[unroll]] for (uint j = 0; j < TN; j++) {
365+
// cache_b[j] = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + j) * SHMEM_STRIDE + i];
366+
// }
367+
368+
// [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
369+
// [[unroll]] for (uint cc = 0; cc < TN; cc++) {
370+
// [[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
371+
// const uint sums_idx = (cr * WNITER + wsic) * (WMITER * TN) + cc * WMITER + wsir;
372+
// sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b[cc].x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b[cc].y), sums[sums_idx].x));
373+
// sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b[cc].x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b[cc].y), sums[sums_idx].y));
374+
// }
375+
// }
376+
// }
377+
// }
367378

368-
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
369-
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
379+
[[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) {
380+
[[unroll]] for (uint cc = 0; cc < TN; cc++) {
381+
cache_b = buf_b[(warp_c * WN + wsic * WSUBN + tiwc * TN + cc) * SHMEM_STRIDE + i];
382+
383+
[[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) {
370384
[[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
371-
const uint sums_idx = (cr * WNITER + wsic) * (WMITER * TN) + cc * WMITER + wsir;
372-
sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b[cc].x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b[cc].y), sums[sums_idx].x));
373-
sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b[cc].x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b[cc].y), sums[sums_idx].y));
385+
// [TM / 2][WNITER][TN][WMITER]
386+
// const uint sums_idx = (cr * WNITER + wsic) * (WMITER * TN) + cc * WMITER + wsir;
387+
388+
// sums layout: [WNITER][TN][WMITER][TM / 2], indexed as [wsic][cc][wsir][cr]
389+
const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr;
390+
sums[sums_idx].x = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr ].y), ACC_TYPE(cache_b.y), sums[sums_idx].x));
391+
sums[sums_idx].y = fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].x), ACC_TYPE(cache_b.x), fma(ACC_TYPE(cache_a[wsir * TM + 2 * cr + 1].y), ACC_TYPE(cache_b.y), sums[sums_idx].y));
374392
}
375393
}
376394
}
377395
}
396+
378397
}
379398
#endif
380399

@@ -466,7 +485,7 @@ void main() {
466485
const u16vec2 row_idx = row_ids[row_i - ic * BN];
467486
#endif // MUL_MAT_ID
468487
[[unroll]] for (uint cr = 0; cr < TM / 2; cr++) {
469-
const uint sums_idx = (cr * WNITER + wsic) * (WMITER * TN) + cc * WMITER + wsir;
488+
const uint sums_idx = (wsic * TN + cc) * WMITER * (TM / 2) + wsir * (TM / 2) + cr;
470489
#ifdef MUL_MAT_ID
471490
if (dr_warp + 2 * cr < p.M) {
472491
data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + 2 * cr] = D_TYPE(sums[sums_idx].x);

stefan.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/usr/bin/env python3
2+
3+
import subprocess
4+
import re
5+
from datetime import datetime
6+
7+
def run_benchmark(command):
    """Run a benchmark command through the shell and return its stdout.

    Args:
        command: Command line to execute, as a single string.

    Returns:
        Everything the command wrote to standard output, as text. The
        output is returned even when the command exits with a non-zero
        status, so partial results can still be parsed.
    """
    import sys  # local import: only needed for the failure warning below

    print(f" Running: {command}")
    # shell=True is kept because the command is passed as one string;
    # only trusted, hard-coded commands should be given to this function.
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        # A failing command used to be indistinguishable from one that
        # simply printed nothing; report the failure without raising.
        print(
            f" Warning: command exited with status {result.returncode}",
            file=sys.stderr,
        )
        if result.stderr:
            print(result.stderr.rstrip(), file=sys.stderr)
    return result.stdout
11+
12+
def parse_output(output):
    """Extract per-kernel timings from benchmark output text.

    Each line is stripped and matched against the expected
    ``MUL_MAT(...)`` / ``MUL_MAT_ID(...)`` result format; lines that do
    not match are ignored.

    Args:
        output: Full text output of a benchmark run.

    Returns:
        Dict mapping the kernel description to its ``us/run`` value as a
        float. If a kernel appears more than once, the last value wins.
    """
    timing_re = re.compile(r"^(MUL_MAT(?:_ID)?\(.*?\)):\s+\d+\s+runs\s+-\s+([\d.]+)\s+us/run")

    # Lazily try every stripped line; non-matching lines yield None.
    candidates = (timing_re.match(raw.strip()) for raw in output.splitlines())
    return {hit.group(1): float(hit.group(2)) for hit in candidates if hit}
23+
24+
def generate_markdown(before, after, label_before, label_after):
    """Write a Markdown table comparing two sets of per-kernel timings.

    The report is written to ``perf_comparison_<timestamp>.md`` in the
    current working directory.

    Args:
        before: Mapping of kernel name to us/run for the baseline run.
        after: Mapping of kernel name to us/run for the modified run.
        label_before: Column label for the baseline values.
        label_after: Column label for the modified values.

    Returns:
        The name of the report file that was written.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    filename = f"perf_comparison_{timestamp}.md"

    lines = [
        "# Performance Comparison",
        f"Comparing `{label_before}` vs `{label_after}`\n",
        "| Kernel | {0} (us/run) | {1} (us/run) | Δ % |".format(label_before, label_after),
        "|--------|--------------|--------------|-----|",
    ]

    # Keep the order of 'before', then append kernels that only exist in
    # 'after' so they are not silently dropped from the report.
    kernels = list(before) + [name for name in after if name not in before]

    for kernel in kernels:
        val1 = before.get(kernel)
        val2 = after.get(kernel)

        col1 = "N/A" if val1 is None else f"{val1:.2f}"
        col2 = "N/A" if val2 is None else f"{val2:.2f}"

        # A relative change needs both values and a non-zero baseline.
        if val1 is None or val2 is None or val1 == 0:
            delta = "N/A"
        else:
            delta = f"{((val2 - val1) / val1) * 100:+.2f}%"

        lines.append(f"| `{kernel}` | {col1} | {col2} | {delta} |")

    # Explicit UTF-8: the header contains a non-ASCII character.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write('\n'.join(lines))
    print(f"Markdown report saved to: {filename}")
    return filename
51+
52+
if __name__ == "__main__":
    # Customize these two commands: one per build being compared.
    cmd_before = '/home/stefan/sabac/llama.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o MUL_MAT'
    cmd_after = '/home/stefan/final/llama-stefan.cpp/./build/bin/test-backend-ops perf -b Vulkan0 -o MUL_MAT'

    output_before = run_benchmark(cmd_before)
    output_after = run_benchmark(cmd_after)

    data_before = parse_output(output_before)
    data_after = parse_output(output_after)

    # Use distinct labels: with the same label on both sides the two value
    # columns of the report cannot be told apart.
    generate_markdown(data_before, data_after, "MUL_MAT (before)", "MUL_MAT (after)")

0 commit comments

Comments
 (0)