@@ -130,6 +130,45 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
130130 " Error in hsa_amd_agent_iterate_memory_pools: %s" );
131131}
132132
133+ // / Dispatches an asynchronous memory copy.
134+ // / Enables different SDMA engines for the dispatch in a round-robin fashion.
135+ Error asyncMemCopy (bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
136+ const void *Src, hsa_agent_t SrcAgent, size_t Size,
137+ uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
138+ hsa_signal_t CompletionSignal) {
139+ if (UseMultipleSdmaEngines) {
140+ hsa_status_t S =
141+ hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, Size,
142+ NumDepSignals, DepSignals, CompletionSignal);
143+ return Plugin::check (S, " Error in hsa_amd_memory_async_copy: %s" );
144+ }
145+
146+ // This solution is probably not the best
147+ #if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
148+ HSA_AMD_INTERFACE_VERSION_MINOR >= 2 )
149+ return Plugin::error (" Async copy on selected SDMA requires ROCm 5.7" );
150+ #else
151+ static std::atomic<int > SdmaEngine{1 };
152+
153+ // This atomics solution is probably not the best, but should be sufficient
154+ // for now.
155+ // In a worst case scenario, in which threads read the same value, they will
156+ // dispatch to the same SDMA engine. This may result in sub-optimal
157+ // performance. However, I think the possibility to be fairly low.
158+ int LocalSdmaEngine = SdmaEngine.load (std::memory_order_acquire);
159+ // This call is only avail in ROCm >= 5.7
160+ hsa_status_t S = hsa_amd_memory_async_copy_on_engine (
161+ Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
162+ CompletionSignal, (hsa_amd_sdma_engine_id_t )LocalSdmaEngine,
163+ /* force_copy_on_sdma=*/ true );
164+ // Increment to use one of three SDMA engines: 0x1, 0x2, 0x4
165+ LocalSdmaEngine = (LocalSdmaEngine << 1 ) % 7 ;
166+ SdmaEngine.store (LocalSdmaEngine, std::memory_order_relaxed);
167+
168+ return Plugin::check (S, " Error in hsa_amd_memory_async_copy_on_engine: %s" );
169+ #endif
170+ }
171+
133172} // namespace utils
134173
135174// / Utility class representing generic resource references to AMDGPU resources.
@@ -945,6 +984,9 @@ struct AMDGPUStreamTy {
945984 // / Timeout hint for HSA actively waiting for signal value to change
946985 const uint64_t StreamBusyWaitMicroseconds;
947986
987+ // / Indicate whether to spread data transfers across all available SDMAs
988+ bool UseMultipleSdmaEngines;
989+
948990 // / Return the current number of asynchronous operations on the stream.
949991 uint32_t size () const { return NextSlot; }
950992
@@ -1170,15 +1212,15 @@ struct AMDGPUStreamTy {
11701212 InputSignal = nullptr ;
11711213
11721214 // Issue the async memory copy.
1173- hsa_status_t Status;
11741215 if (InputSignal) {
11751216 hsa_signal_t InputSignalRaw = InputSignal->get ();
1176- Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 1 ,
1177- &InputSignalRaw, OutputSignal->get ());
1178- } else
1179- Status = hsa_amd_memory_async_copy (Dst, Agent, Src, Agent, CopySize, 0 ,
1180- nullptr , OutputSignal->get ());
1181- return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1217+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1218+ CopySize, 1 , &InputSignalRaw,
1219+ OutputSignal->get ());
1220+ }
1221+
1222+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
1223+ CopySize, 0 , nullptr , OutputSignal->get ());
11821224 }
11831225
11841226 // / Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1256,19 @@ struct AMDGPUStreamTy {
12141256
12151257 // Issue the first step: device to host transfer. Avoid defining the input
12161258 // dependency if already satisfied.
1217- hsa_status_t Status;
12181259 if (InputSignal) {
12191260 hsa_signal_t InputSignalRaw = InputSignal->get ();
1220- Status =
1221- hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 1 ,
1222- &InputSignalRaw, OutputSignals[0 ]->get ());
1261+ if (auto Err = utils::asyncMemCopy (
1262+ UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1 ,
1263+ &InputSignalRaw, OutputSignals[0 ]->get ()))
1264+ return Err;
12231265 } else {
1224- Status = hsa_amd_memory_async_copy (Inter, Agent, Src, Agent, CopySize, 0 ,
1225- nullptr , OutputSignals[0 ]->get ());
1266+ if (auto Err = utils::asyncMemCopy (UseMultipleSdmaEngines, Inter, Agent,
1267+ Src, Agent, CopySize, 0 , nullptr ,
1268+ OutputSignals[0 ]->get ()))
1269+ return Err;
12261270 }
12271271
1228- if (auto Err =
1229- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
1230- return Err;
1231-
12321272 // Consume another stream slot and compute dependencies.
12331273 std::tie (Curr, InputSignal) = consume (OutputSignals[1 ]);
12341274 assert (InputSignal && " Invalid input signal" );
@@ -1242,7 +1282,7 @@ struct AMDGPUStreamTy {
12421282 std::atomic_thread_fence (std::memory_order_release);
12431283
12441284 // Issue the second step: host to host transfer.
1245- Status = hsa_amd_signal_async_handler (
1285+ hsa_status_t Status = hsa_amd_signal_async_handler (
12461286 InputSignal->get (), HSA_SIGNAL_CONDITION_EQ, 0 , asyncActionCallback,
12471287 (void *)&Slots[Curr]);
12481288
@@ -1318,16 +1358,14 @@ struct AMDGPUStreamTy {
13181358
13191359 // Issue the second step: host to device transfer. Avoid defining the input
13201360 // dependency if already satisfied.
1321- hsa_status_t Status;
13221361 if (InputSignal && InputSignal->load ()) {
13231362 hsa_signal_t InputSignalRaw = InputSignal->get ();
1324- Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 1 ,
1325- &InputSignalRaw, OutputSignal->get ());
1326- } else
1327- Status = hsa_amd_memory_async_copy (Dst, Agent, Inter, Agent, CopySize, 0 ,
1328- nullptr , OutputSignal->get ());
1329-
1330- return Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" );
1363+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter,
1364+ Agent, CopySize, 1 , &InputSignalRaw,
1365+ OutputSignal->get ());
1366+ }
1367+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
1368+ CopySize, 0 , nullptr , OutputSignal->get ());
13311369 }
13321370
13331371 // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1391,15 @@ struct AMDGPUStreamTy {
13531391 // allocated by this runtime or the caller made the appropriate
13541392 // access calls.
13551393
1356- hsa_status_t Status;
13571394 if (InputSignal && InputSignal->load ()) {
13581395 hsa_signal_t InputSignalRaw = InputSignal->get ();
1359- Status =
1360- hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize, 1 ,
1361- &InputSignalRaw, OutputSignal->get ());
1362- } else
1363- Status = hsa_amd_memory_async_copy (Dst, DstAgent, Src, SrcAgent, CopySize,
1364- 0 , nullptr , OutputSignal->get ());
1365-
1366- return Plugin::check (Status, " Error in D2D hsa_amd_memory_async_copy: %s" );
1396+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1397+ SrcAgent, CopySize, 1 , &InputSignalRaw,
1398+ OutputSignal->get ());
1399+ }
1400+ return utils::asyncMemCopy (UseMultipleSdmaEngines, Dst, DstAgent, Src,
1401+ SrcAgent, CopySize, 0 , nullptr ,
1402+ OutputSignal->get ());
13671403 }
13681404
13691405 // / Synchronize with the stream. The current thread waits until all operations
@@ -1788,6 +1824,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
17881824 OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
17891825 64 ),
17901826 OMPX_StreamBusyWait (" LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT" , 2000000 ),
1827+ OMPX_UseMultipleSdmaEngines (
1828+ " LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES" , false ),
17911829 AMDGPUStreamManager (*this , Agent), AMDGPUEventManager(*this ),
17921830 AMDGPUSignalManager (*this ), Agent(Agent), HostDevice(HostDevice) {}
17931831
@@ -2206,10 +2244,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22062244 if (auto Err = Signal.init ())
22072245 return Err;
22082246
2209- Status = hsa_amd_memory_async_copy (TgtPtr, Agent, PinnedPtr, Agent, Size,
2210- 0 , nullptr , Signal.get ());
2211- if (auto Err =
2212- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2247+ if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), TgtPtr,
2248+ Agent, PinnedPtr, Agent, Size, 0 ,
2249+ nullptr , Signal.get ()))
22132250 return Err;
22142251
22152252 if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2267,10 +2304,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
22672304 if (auto Err = Signal.init ())
22682305 return Err;
22692306
2270- Status = hsa_amd_memory_async_copy (PinnedPtr, Agent, TgtPtr, Agent, Size,
2271- 0 , nullptr , Signal.get ());
2272- if (auto Err =
2273- Plugin::check (Status, " Error in hsa_amd_memory_async_copy: %s" ))
2307+ if (auto Err = utils::asyncMemCopy (useMultipleSdmaEngines (), PinnedPtr,
2308+ Agent, TgtPtr, Agent, Size, 0 , nullptr ,
2309+ Signal.get ()))
22742310 return Err;
22752311
22762312 if (auto Err = Signal.wait (getStreamBusyWaitMicroseconds ()))
@@ -2633,6 +2669,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26332669 });
26342670 }
26352671
// / Return the value of the OMPX_UseMultipleSdmaEngines envar member, i.e.
// / whether data transfers should be spread across multiple SDMA engines.
2672+ bool useMultipleSdmaEngines () const { return OMPX_UseMultipleSdmaEngines; }
2673+
26362674private:
26372675 using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
26382676 using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2702,6 +2740,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
27022740 // / are microseconds.
27032741 UInt32Envar OMPX_StreamBusyWait;
27042742
2743+ // / Use ROCm 5.7 interface for multiple SDMA engines
2744+ BoolEnvar OMPX_UseMultipleSdmaEngines;
2745+
27052746 // / Stream manager for AMDGPU streams.
27062747 AMDGPUStreamManagerTy AMDGPUStreamManager;
27072748
@@ -2803,7 +2844,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
28032844 SignalManager (Device.getSignalManager()), Device(Device),
28042845 // Initialize the std::deque with some empty positions.
28052846 Slots(32 ), NextSlot(0 ), SyncCycle(0 ), RPCServer(nullptr ),
2806- StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
2847+ StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
2848+ UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
28072849
28082850// / Class implementing the AMDGPU-specific functionalities of the global
28092851// / handler.
0 commit comments