Skip to content

Commit b301385

Browse files
committed
Fix latency time measurement
1 parent 56ad603 commit b301385

File tree

1 file changed

+10
-8
lines changed

1 file changed

+10
-8
lines changed

benchmarks/bench_load_latency.py

Lines changed: 10 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -38,10 +38,10 @@ def load_remote(
3838
if i == skip:
3939
start = read_realtime()
4040
tl.store(mm_begin_timestamp_ptr + peer_rank * BLOCK_SIZE + offsets, start, time_stmp_mask)
41-
41+
4242
# iris.load(data + offsets, curr_rank, peer_rank,heap_bases, data_mask)
4343
from_base = tl.load(heap_bases + curr_rank)
44-
to_base = tl.load(heap_bases + peer_rank)
44+
to_base = tl.load(heap_bases + peer_rank)
4545
offset = tl.cast(data + offsets, tl.uint64) - from_base
4646
translated_ptr = tl.cast(tl.cast(to_base, tl.pointer_type(tl.int8)) + offset, (data + offsets).dtype)
4747
result = tl.load(translated_ptr, mask=data_mask, cache_modifier=".cv", volatile=True)
@@ -240,15 +240,14 @@ def print_run_settings(
240240
grid = lambda meta: (1,)
241241
for source_rank in range(num_ranks):
242242
for destination_rank in range(num_ranks):
243-
if cur_rank in [source_rank, destination_rank]:
244-
peer_for_me = destination_rank if cur_rank == source_rank else source_rank
243+
if cur_rank == source_rank:
245244
load_remote[grid](
246245
source_buffer,
247246
BUFFER_LEN,
248247
skip,
249248
niter,
250249
cur_rank,
251-
peer_for_me,
250+
destination_rank,
252251
BLOCK_SIZE,
253252
heap_bases,
254253
mm_begin_timestamp,
@@ -258,13 +257,16 @@ def print_run_settings(
258257

259258
mm_begin_cpu = mm_begin_timestamp.cpu().numpy()
260259
mm_end_cpu = mm_end_timestamp.cpu().numpy()
260+
261+
gpu_freq = iris.hip.get_wall_clock_rate(cur_rank)
262+
261263
for destination_rank in range(num_ranks):
262264
delta = mm_end_cpu[destination_rank, :] - mm_begin_cpu[destination_rank, :]
263-
avg_ns = float(delta.sum() / max(1, delta.size) / max(1, niter))
264-
local_latency[destination_rank] = avg_ns
265+
avg_cc = float(delta.sum() / max(1, delta.size) / max(1, niter))
266+
local_latency[destination_rank] = avg_cc * 1e6 / gpu_freq
265267

266268
latency_matrix = mpi_allgather(local_latency.cpu())
267269

268270
if cur_rank == 0:
269271
save_results(latency_matrix, args["output_file"])
270-
print("Benchmark complete.")
272+
print("Benchmark complete.")

0 commit comments

Comments
 (0)