
Commit 8ba5336

Merge pull request #2422 from pbalcer/bench-stability
[benchmarks] add stddev calculation and outlier elimination
2 parents: 43d33e2 + 3496c02 · commit 8ba5336

5 files changed: +104 additions, -39 deletions


scripts/benchmarks/benches/base.py

Lines changed: 0 additions & 3 deletions
@@ -71,9 +71,6 @@ def run(self, env_vars) -> list[Result]:
     def teardown(self):
         raise NotImplementedError()
 
-    def ignore_iterations(self):
-        return False
-
 class Suite:
     def benchmarks(self) -> list[Benchmark]:
         raise NotImplementedError()

scripts/benchmarks/benches/llamacpp.py

Lines changed: 0 additions & 3 deletions
@@ -76,9 +76,6 @@ def name(self):
     def lower_is_better(self):
         return False
 
-    def ignore_iterations(self):
-        return True
-
     def run(self, env_vars) -> list[Result]:
         command = [
             f"{self.benchmark_bin}",

scripts/benchmarks/benches/options.py

Lines changed: 4 additions & 1 deletion
@@ -15,13 +15,16 @@ class Options:
     rebuild: bool = True
     benchmark_cwd: str = "INVALID"
     timeout: float = 600
-    iterations: int = 5
+    iterations: int = 3
     verbose: bool = False
     compare: Compare = Compare.LATEST
     compare_max: int = 10 # average/median over how many results
     output_html: bool = False
     output_markdown: bool = True
     dry_run: bool = False
+    # these two should probably be merged into one setting
+    stddev_threshold: float = 0.02
+    epsilon: float = 0.02
 
 options = Options()
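
The two new settings are relative fractions of a measured value rather than absolute numbers. A rough sketch of how they are meant to be read (the helper names below are illustrative and not part of the scripts; stddev_threshold gates run-to-run noise as in process_results further down, while epsilon is assumed to be the significance threshold for comparisons per the --epsilon help text):

# Illustrative helpers only, not part of the benchmark scripts.
def run_is_stable(mean: float, stddev: float, stddev_threshold: float = 0.02) -> bool:
    # A set of iterations is accepted when the sample stddev stays within
    # stddev_threshold (2% by default) of the mean value.
    return stddev <= stddev_threshold * mean

def change_is_significant(old: float, new: float, epsilon: float = 0.02) -> bool:
    # Assumed reading of epsilon: a result differs meaningfully from a saved
    # one only when the relative change exceeds epsilon (2% by default).
    return abs(new - old) / old > epsilon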

scripts/benchmarks/benches/result.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ class Result:
     lower_is_better: bool = True
     git_hash: str = ''
     date: Optional[datetime] = None
+    stddev: float = 0.0
 
 @dataclass_json
 @dataclass

scripts/benchmarks/main.py

Lines changed: 99 additions & 32 deletions
@@ -18,10 +18,97 @@
 
 import argparse
 import re
+import statistics
 
 # Update this if you are changing the layout of the results files
 INTERNAL_WORKDIR_VERSION = '2.0'
 
+def run_iterations(benchmark: Benchmark, env_vars, iters: int, results: dict[str, list[Result]]):
+    for iter in range(iters):
+        print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True)
+        bench_results = benchmark.run(env_vars)
+        if bench_results is None:
+            print(f"did not finish (OK for sycl-bench).")
+            break
+
+        for bench_result in bench_results:
+            # TODO: report failures in markdown/html ?
+            if not bench_result.passed:
+                print(f"complete ({bench_result.label}: verification FAILED)")
+                continue
+
+            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).")
+
+            bench_result.name = bench_result.label
+            bench_result.lower_is_better = benchmark.lower_is_better()
+
+            if bench_result.label not in results:
+                results[bench_result.label] = []
+
+            results[bench_result.label].append(bench_result)
+
+# https://www.statology.org/modified-z-score/
+def modified_z_score(values: list[float]) -> list[float]:
+    median = statistics.median(values)
+    mad = statistics.median([abs(v - median) for v in values])
+    if mad == 0:
+        return [0] * len(values)
+    return [(0.6745 * (v - median)) / mad for v in values]
+
+def remove_outliers(results: dict[str, list[Result]], threshold: float = 3.5) -> dict[str, list[Result]]:
+    new_results = {}
+    for key, rlist in results.items():
+        # don't eliminate outliers on first pass
+        if len(rlist) <= options.iterations:
+            new_results[key] = rlist
+            continue
+
+        values = [r.value for r in rlist]
+        z_scores = modified_z_score(values)
+        filtered_rlist = [r for r, z in zip(rlist, z_scores) if abs(z) <= threshold]
+
+        if not filtered_rlist:
+            new_results[key] = rlist
+        else:
+            new_results[key] = filtered_rlist
+
+    return new_results
+
+def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result]]:
+    processed: list[Result] = []
+    # technically, we can detect whether result is below or above threshold per
+    # individual result. However, we can't repeat benchmark runs with that
+    # granularity. So we just reject all results and try again.
+    valid_results = True # above stddev threshold
+
+    for label, rlist in remove_outliers(results).items():
+        if (len(rlist) == 0):
+            continue
+
+        if len(rlist) == 1:
+            processed.append(rlist[0])
+            continue
+
+        values = [r.value for r in rlist]
+
+        mean_value = statistics.mean(values)
+        stddev = statistics.stdev(values)
+
+        threshold = options.stddev_threshold * mean_value
+
+        if stddev > threshold:
+            print(f"stddev {stddev} above the threshold {threshold} for {label}")
+            valid_results = False
+
+        rlist.sort(key=lambda res: res.value)
+        median_index = len(rlist) // 2
+        median_result = rlist[median_index]
+        median_result.stddev = stddev
+
+        processed.append(median_result)
+
+    return valid_results, processed
+
 def main(directory, additional_env_vars, save_name, compare_names, filter):
     prepare_workdir(directory, INTERNAL_WORKDIR_VERSION)
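
A quick worked example of the outlier logic above (made-up numbers, applied to plain floats rather than Result objects): modified_z_score is based on the median and the median absolute deviation (MAD), so a single slow run stands out sharply even in a small sample, and remove_outliers drops any result whose score exceeds the 3.5 threshold once more than options.iterations results have accumulated for a label.

values = [100.0, 101.0, 99.5, 100.5, 150.0]
# median = 100.5, MAD = 0.5
scores = modified_z_score(values)
# scores ~= [-0.67, 0.67, -1.35, 0.0, 66.78]
kept = [v for v, z in zip(values, scores) if abs(z) <= 3.5]
# kept == [100.0, 101.0, 99.5, 100.5]; the 150.0 outlier is rejected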

@@ -65,36 +152,15 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
     for benchmark in benchmarks:
         try:
             merged_env_vars = {**additional_env_vars}
-            iteration_results = []
-            iterations = options.iterations if not benchmark.ignore_iterations() else 1
-            for iter in range(iterations):
-                print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True)
-                bench_results = benchmark.run(merged_env_vars)
-                if bench_results is not None:
-                    for bench_result in bench_results:
-                        if bench_result.passed:
-                            print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).")
-                        else:
-                            print(f"complete ({bench_result.label}: verification FAILED)")
-                        iteration_results.append(bench_result)
-                else:
-                    print(f"did not finish (OK for sycl-bench).")
+            intermediate_results: dict[str, list[Result]] = {}
+            processed: list[Result] = []
+            for _ in range(5):
+                run_iterations(benchmark, merged_env_vars, options.iterations, intermediate_results)
+                valid, processed = process_results(intermediate_results)
+                if valid:
                     break
+            results += processed
 
-            if len(iteration_results) == 0:
-                continue
-
-            for label in set([result.label for result in iteration_results]):
-                label_results = [result for result in iteration_results if result.label == label and result.passed == True]
-                if len(label_results) > 0:
-                    label_results.sort(key=lambda res: res.value)
-                    median_index = len(label_results) // 2
-                    median_result = label_results[median_index]
-
-                    median_result.name = label
-                    median_result.lower_is_better = benchmark.lower_is_better()
-
-                    results.append(median_result)
         except Exception as e:
             if options.exit_on_failure:
                 raise e
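
The loop above runs options.iterations iterations per round and retries, for at most 5 rounds, until process_results reports every label as stable. As a rough numeric illustration of that acceptance check with the default stddev_threshold of 0.02 (made-up values, using the statistics module directly instead of Result objects):

import statistics

values = [100.0, 101.0, 103.0]        # one label's measurements after outlier removal
mean_value = statistics.mean(values)  # ~101.33
stddev = statistics.stdev(values)     # ~1.53
threshold = 0.02 * mean_value         # ~2.03
accepted = stddev <= threshold        # True: the median result (101.0) is kept, with stddev attached

If any label's spread is above its threshold, the whole benchmark is re-run for another round and the check repeats over the accumulated, outlier-filtered samples.
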
@@ -164,14 +230,15 @@ def validate_and_parse_env_args(env_args):
     parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
     parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
     parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
-    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5)
-    parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600)
+    parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations)
+    parser.add_argument("--stddev-threshold", type=float, help='If stddev % is above this threshold, rerun all iterations', default=options.stddev_threshold)
+    parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout)
     parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None)
-    parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=0.005)
+    parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=options.epsilon)
     parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true")
     parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true")
     parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value)
-    parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=10)
+    parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max)
     parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False)
     parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True)
     parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False)
