Commit 2025020

colesbury authored and pull[bot] committed
gh-125985: Add free threading scaling micro benchmarks (#125986)
These consist of a number of short snippets that help identify scaling bottlenecks in the free-threaded interpreter. The current bottlenecks are in benchmarks that call functions (due to `LOAD_ATTR` not yet using deferred reference counting) and in benchmarks that access thread-local data.
1 parent 876afee commit 2025020
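As context for the commit message, a minimal hedged sketch (not part of the commit) of the pattern it attributes to `LOAD_ATTR` reference-count contention; the new file's `module_function` benchmark exercises the same attribute-load-and-call shape, and the script's worker threads run it concurrently much like this:

import sys
import threading

module = sys.modules[__name__]

def double(x):
    return x + x

def worker():
    for i in range(100_000):
        # Each iteration performs a LOAD_ATTR on the module and calls the
        # shared function object; without deferred reference counting every
        # thread updates that object's reference count, limiting scaling.
        module.double(i)

workers = [threading.Thread(target=worker) for _ in range(8)]
for t in workers:
    t.start()
for t in workers:
    t.join()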

1 file changed, +324 -0 lines changed
@@ -0,0 +1,324 @@
# This script runs a set of small benchmarks to help identify scaling
# bottlenecks in the free-threaded interpreter. The benchmarks consist
# of patterns that ought to scale well, but haven't in the past. This is
# typically due to reference count contention or lock contention.
#
# This is not intended to be a general multithreading benchmark suite, nor
# are the benchmarks intended to be representative of real-world workloads.
#
# On Linux, to avoid confounding hardware effects, the script attempts to:
# * Use a single CPU socket (to avoid NUMA effects)
# * Use distinct physical cores (to avoid hyperthreading/SMT effects)
# * Use "performance" cores (Intel, ARM) on CPUs that have performance and
#   efficiency cores
#
# It also helps to disable dynamic frequency scaling (i.e., "Turbo Boost")
#
# Intel:
# > echo "1" | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo
#
# AMD:
# > echo "0" | sudo tee /sys/devices/system/cpu/cpufreq/boost
#

import math
import os
import queue
import sys
import threading
import time

# The iterations in individual benchmarks are scaled by this factor.
WORK_SCALE = 100

ALL_BENCHMARKS = {}

threads = []
in_queues = []
out_queues = []


def register_benchmark(func):
    ALL_BENCHMARKS[func.__name__] = func
    return func

@register_benchmark
def object_cfunction():
    accu = 0
    tab = [1] * 100
    for i in range(1000 * WORK_SCALE):
        tab.pop(0)
        tab.append(i)
        accu += tab[50]
    return accu

@register_benchmark
def cmodule_function():
    for i in range(1000 * WORK_SCALE):
        math.floor(i * i)

@register_benchmark
def mult_constant():
    x = 1.0
    for i in range(3000 * WORK_SCALE):
        x *= 1.01

def simple_gen():
    for i in range(10):
        yield i

@register_benchmark
def generator():
    accu = 0
    for i in range(100 * WORK_SCALE):
        for v in simple_gen():
            accu += v
    return accu

class Counter:
    def __init__(self):
        self.i = 0

    def next_number(self):
        self.i += 1
        return self.i

@register_benchmark
def pymethod():
    c = Counter()
    for i in range(1000 * WORK_SCALE):
        c.next_number()
    return c.i

def next_number(i):
    return i + 1

@register_benchmark
def pyfunction():
    accu = 0
    for i in range(1000 * WORK_SCALE):
        accu = next_number(i)
    return accu

def double(x):
    return x + x

module = sys.modules[__name__]

@register_benchmark
def module_function():
    total = 0
    for i in range(1000 * WORK_SCALE):
        total += module.double(i)
    return total

class MyObject:
    pass

@register_benchmark
def load_string_const():
    accu = 0
    for i in range(1000 * WORK_SCALE):
        if i == 'a string':
            accu += 7
        else:
            accu += 1
    return accu

@register_benchmark
def load_tuple_const():
    accu = 0
    for i in range(1000 * WORK_SCALE):
        if i == (1, 2):
            accu += 7
        else:
            accu += 1
    return accu

@register_benchmark
def create_pyobject():
    for i in range(1000 * WORK_SCALE):
        o = MyObject()

@register_benchmark
def create_closure():
    for i in range(1000 * WORK_SCALE):
        def foo(x):
            return x
        foo(i)

@register_benchmark
def create_dict():
    for i in range(1000 * WORK_SCALE):
        d = {
            "key": "value",
        }

thread_local = threading.local()

@register_benchmark
def thread_local_read():
    tmp = thread_local
    tmp.x = 10
    for i in range(500 * WORK_SCALE):
        _ = tmp.x
        _ = tmp.x
        _ = tmp.x
        _ = tmp.x
        _ = tmp.x


def bench_one_thread(func):
    t0 = time.perf_counter_ns()
    func()
    t1 = time.perf_counter_ns()
    return t1 - t0


def bench_parallel(func):
    t0 = time.perf_counter_ns()
    for inq in in_queues:
        inq.put(func)
    for outq in out_queues:
        outq.get()
    t1 = time.perf_counter_ns()
    return t1 - t0


def benchmark(func):
    delta_one_thread = bench_one_thread(func)
    delta_many_threads = bench_parallel(func)

    speedup = delta_one_thread * len(threads) / delta_many_threads
    if speedup >= 1:
        factor = speedup
        direction = "faster"
    else:
        factor = 1 / speedup
        direction = "slower"

    use_color = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
    color = reset_color = ""
    if use_color:
        if speedup <= 1.1:
            color = "\x1b[31m" # red
        elif speedup < len(threads)/2:
            color = "\x1b[33m" # yellow
        reset_color = "\x1b[0m"

    print(f"{color}{func.__name__:<18} {round(factor, 1):>4}x {direction}{reset_color}")

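# Explanatory note added in this write-up (not part of the commit):
# bench_parallel() hands `func` to every worker thread and waits for all of
# them to finish, so each thread repeats the full single-thread workload.
# The ratio
#     speedup = single_thread_time * num_threads / parallel_wall_time
# is therefore a throughput scaling factor whose ideal value is the number
# of threads; the colors flag poor scaling (red at <= 1.1x, yellow below
# half the thread count).
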
def determine_num_threads_and_affinity():
    if sys.platform != "linux":
        return [None] * os.cpu_count()

    # Try to use `lscpu -p` on Linux
    import subprocess
    try:
        output = subprocess.check_output(["lscpu", "-p=cpu,node,core,MAXMHZ"],
                                         text=True, env={"LC_NUMERIC": "C"})
    except (FileNotFoundError, subprocess.CalledProcessError):
        return [None] * os.cpu_count()

    table = []
    for line in output.splitlines():
        if line.startswith("#"):
            continue
        cpu, node, core, maxhz = line.split(",")
        if maxhz == "":
            maxhz = "0"
        table.append((int(cpu), int(node), int(core), float(maxhz)))

    cpus = []
    cores = set()
    max_mhz_all = max(row[3] for row in table)
    for cpu, node, core, maxmhz in table:
        # Choose only CPUs on the same node, unique cores, and try to avoid
        # "efficiency" cores.
        if node == 0 and core not in cores and maxmhz == max_mhz_all:
            cpus.append(cpu)
            cores.add(core)
    return cpus

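# Illustrative note added in this write-up (not part of the commit): the
# parser above expects `lscpu -p=cpu,node,core,MAXMHZ` output, which is a
# set of "#"-prefixed header lines followed by comma-separated rows, e.g.:
#
#   # CPU,Node,Core,MaxMHz
#   0,0,0,4800.0000
#   1,0,1,4800.0000
#   8,0,8,3600.0000    <- lower max frequency, treated as an "efficiency" core
#
# The MAXMHZ field may be empty on machines that do not report it, which the
# code above maps to 0.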
244+
def thread_run(cpu, in_queue, out_queue):
245+
if cpu is not None and hasattr(os, "sched_setaffinity"):
246+
# Set the affinity for the current thread
247+
os.sched_setaffinity(0, (cpu,))
248+
249+
while True:
250+
func = in_queue.get()
251+
if func is None:
252+
break
253+
func()
254+
out_queue.put(None)
255+
256+
257+
def initialize_threads(opts):
258+
if opts.threads == -1:
259+
cpus = determine_num_threads_and_affinity()
260+
else:
261+
cpus = [None] * opts.threads # don't set affinity
262+
263+
print(f"Running benchmarks with {len(cpus)} threads")
264+
for cpu in cpus:
265+
inq = queue.Queue()
266+
outq = queue.Queue()
267+
in_queues.append(inq)
268+
out_queues.append(outq)
269+
t = threading.Thread(target=thread_run, args=(cpu, inq, outq), daemon=True)
270+
threads.append(t)
271+
t.start()
272+
273+
274+
def main(opts):
275+
global WORK_SCALE
276+
if not hasattr(sys, "_is_gil_enabled") or sys._is_gil_enabled():
277+
sys.stderr.write("expected to be run with the GIL disabled\n")
278+
279+
benchmark_names = opts.benchmarks
280+
if benchmark_names:
281+
for name in benchmark_names:
282+
if name not in ALL_BENCHMARKS:
283+
sys.stderr.write(f"Unknown benchmark: {name}\n")
284+
sys.exit(1)
285+
else:
286+
benchmark_names = ALL_BENCHMARKS.keys()
287+
288+
WORK_SCALE = opts.scale
289+
290+
if not opts.baseline_only:
291+
initialize_threads(opts)
292+
293+
do_bench = not opts.baseline_only and not opts.parallel_only
294+
for name in benchmark_names:
295+
func = ALL_BENCHMARKS[name]
296+
if do_bench:
297+
benchmark(func)
298+
continue
299+
300+
if opts.parallel_only:
301+
delta_ns = bench_parallel(func)
302+
else:
303+
delta_ns = bench_one_thread(func)
304+
305+
time_ms = delta_ns / 1_000_000
306+
print(f"{func.__name__:<18} {time_ms:.1f} ms")
307+
308+
309+
if __name__ == "__main__":
310+
import argparse
311+
312+
parser = argparse.ArgumentParser()
313+
parser.add_argument("-t", "--threads", type=int, default=-1,
314+
help="number of threads to use")
315+
parser.add_argument("--scale", type=int, default=100,
316+
help="work scale factor for the benchmark (default=100)")
317+
parser.add_argument("--baseline-only", default=False, action="store_true",
318+
help="only run the baseline benchmarks (single thread)")
319+
parser.add_argument("--parallel-only", default=False, action="store_true",
320+
help="only run the parallel benchmark (many threads)")
321+
parser.add_argument("benchmarks", nargs="*",
322+
help="benchmarks to run")
323+
options = parser.parse_args()
324+
main(options)
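For reference, a hedged usage sketch (the file name below is a placeholder; the flags come from the argparse block above, and a free-threaded CPython build is assumed):

    python ftscalingbench.py                    # all benchmarks, one worker thread per selected CPU
    python ftscalingbench.py -t 8 pyfunction    # 8 worker threads, a single benchmark
    python ftscalingbench.py --baseline-only    # single-thread timings only (printed in ms)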
