@@ -38,10 +38,10 @@ def load_remote(
38
38
if i == skip :
39
39
start = read_realtime ()
40
40
tl .store (mm_begin_timestamp_ptr + peer_rank * BLOCK_SIZE + offsets , start , time_stmp_mask )
41
-
41
+
42
42
# iris.load(data + offsets, curr_rank, peer_rank,heap_bases, data_mask)
43
43
from_base = tl .load (heap_bases + curr_rank )
44
- to_base = tl .load (heap_bases + peer_rank )
44
+ to_base = tl .load (heap_bases + peer_rank )
45
45
offset = tl .cast (data + offsets , tl .uint64 ) - from_base
46
46
translated_ptr = tl .cast (tl .cast (to_base , tl .pointer_type (tl .int8 )) + offset , (data + offsets ).dtype )
47
47
result = tl .load (translated_ptr , mask = data_mask , cache_modifier = ".cv" , volatile = True )
@@ -240,15 +240,14 @@ def print_run_settings(
240
240
grid = lambda meta : (1 ,)
241
241
for source_rank in range (num_ranks ):
242
242
for destination_rank in range (num_ranks ):
243
- if cur_rank in [source_rank , destination_rank ]:
244
- peer_for_me = destination_rank if cur_rank == source_rank else source_rank
243
+ if cur_rank == source_rank :
245
244
load_remote [grid ](
246
245
source_buffer ,
247
246
BUFFER_LEN ,
248
247
skip ,
249
248
niter ,
250
249
cur_rank ,
251
- peer_for_me ,
250
+ destination_rank ,
252
251
BLOCK_SIZE ,
253
252
heap_bases ,
254
253
mm_begin_timestamp ,
@@ -258,13 +257,16 @@ def print_run_settings(
258
257
259
258
mm_begin_cpu = mm_begin_timestamp .cpu ().numpy ()
260
259
mm_end_cpu = mm_end_timestamp .cpu ().numpy ()
260
+
261
+ gpu_freq = iris .hip .get_wall_clock_rate (cur_rank )
262
+
261
263
for destination_rank in range (num_ranks ):
262
264
delta = mm_end_cpu [destination_rank , :] - mm_begin_cpu [destination_rank , :]
263
- avg_ns = float (delta .sum () / max (1 , delta .size ) / max (1 , niter ))
264
- local_latency [destination_rank ] = avg_ns
265
+ avg_cc = float (delta .sum () / max (1 , delta .size ) / max (1 , niter ))
266
+ local_latency [destination_rank ] = avg_cc * 1e6 / gpu_freq
265
267
266
268
latency_matrix = mpi_allgather (local_latency .cpu ())
267
269
268
270
if cur_rank == 0 :
269
271
save_results (latency_matrix , args ["output_file" ])
270
- print ("Benchmark complete." )
272
+ print ("Benchmark complete." )
0 commit comments