diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
index bae4237194590..9a30ded95b0cd 100644
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@@ -178,11 +178,109 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
       false);
 }
 
+/// Mostly like _v2 but with the builtin assumption that we have less than
+/// num_of_records (by default 1024) teams.
+int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
+    IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
+    uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
+    InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
+    ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
+  // Terminate all threads in non-SPMD mode except for the main thread.
+  uint32_t ThreadId = mapping::getThreadIdInBlock();
+  if (mapping::isGenericMode()) {
+    if (!mapping::isMainThreadInGenericMode())
+      return 0;
+    ThreadId = 0;
+  }
+
+  uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
+
+  // In non-generic mode all workers participate in the teams reduction.
+  // In generic mode only the team main thread participates in the teams
+  // reduction because the workers are waiting for parallel work.
+  uint32_t NumThreads = omp_get_num_threads();
+  uint32_t TeamId = omp_get_team_num();
+  uint32_t NumTeams = omp_get_num_teams();
+  static unsigned SHARED(ChunkTeamCount);
+
+  // Block progress for teams greater than the current upper limit. We only
+  // allow a number of teams less than or equal to the number of slots in
+  // the buffer.
+  bool IsMain = (ThreadId == 0);
+
+  if (IsMain) {
+    lgcpyFct(GlobalBuffer, TeamId, reduce_data);
+
+    // Propagate the memory writes above to the world.
+    fence::kernel(atomic::release);
+
+    // Increment team counter.
+    // This counter is incremented by all teams in the current
+    // BUFFER_SIZE chunk.
+    ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
+                                 atomic::MemScopeTy::device);
+  }
+
+  // Synchronize in SPMD mode as in generic mode all threads but one are in
+  // the state machine.
+  if (mapping::isSPMDMode())
+    synchronize::threadsAligned(atomic::acq_rel);
+
+  // Each thread will have a local struct containing the values to be
+  // reduced:
+  //    1. do reduction within each warp.
+  //    2. do reduction across warps.
+  //    3. write the final result to the main reduction variable
+  //       by returning 1 in the thread holding the reduction result.
+
+  // Check if this is the very last team.
+  if (ChunkTeamCount != NumTeams - 1)
+    return 0;
+
+  // Last team processing.
+  NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
+  if (ThreadId >= NumThreads)
+    return 0;
+
+  // Ensure we see the global memory writes by other teams.
+  fence::kernel(atomic::acquire);
+
+  // Load from buffer and reduce.
+  glcpyFct(GlobalBuffer, ThreadId, reduce_data);
+  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
+    glredFct(GlobalBuffer, i, reduce_data);
+
+  // Reduce across warps to the warp main.
+  gpu_regular_warp_reduce(reduce_data, shflFct);
+
+  uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
+  uint32_t WarpsNeeded =
+      (ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
+  // Gather all the reduced values from each warp
+  // to the first warp.
+  cpyFct(reduce_data, WarpsNeeded);
+
+  if (mapping::getWarpIdInBlock() == 0)
+    gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
+
+  return IsMain;
+}
+
 int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
     ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
     ListGlobalFnTy glredFct) {
+  // The first check is a compile-time constant, the second one a runtime
+  // check. If the first one succeeds we will use the specialized version.
+  if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
+       state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
+       num_of_records == 1024) ||
+      (omp_get_num_teams() <= num_of_records))
+    return __kmpc_nvptx_teams_reduce_nowait_v3(
+        Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
+        lgcpyFct, lgredFct, glcpyFct, glredFct);
+
   // Terminate all threads in non-SPMD mode except for the master thread.
   uint32_t ThreadId = mapping::getThreadIdInBlock();
   if (mapping::isGenericMode()) {
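
The heart of _v3 is the "last team wins" election: every team publishes its partial result, release-increments a device-wide counter, and only the team that draws the final ticket performs the cross-team reduction. Below is a minimal host-side C++ sketch of that pattern, assuming a plain int payload and a + reduction; the names (partials, arrival_count, TEAMS, team_main) are illustrative stand-ins, not DeviceRTL API, and std::atomic orderings play the role of fence::kernel plus atomic::inc.

// last_team_election.cpp -- illustrative sketch only; names are hypothetical.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

constexpr unsigned TEAMS = 64;           // stand-in for NumTeams

int partials[TEAMS];                     // stand-in for GlobalBuffer
std::atomic<unsigned> arrival_count{0};  // stand-in for ReductionCnt
int result = 0;

void team_main(unsigned team_id) {
  // lgcpyFct: publish this team's partial result to the global buffer.
  partials[team_id] = static_cast<int>(team_id) + 1;

  // fence::kernel(atomic::release) + atomic::inc: the acq_rel RMW makes the
  // write above visible to whichever team reads the final counter value.
  unsigned ticket = arrival_count.fetch_add(1, std::memory_order_acq_rel);

  // ChunkTeamCount != NumTeams - 1: every team but the last simply leaves.
  if (ticket != TEAMS - 1)
    return;

  // fence::kernel(atomic::acquire): the last team now sees all partials
  // (the acquire side of its fetch_add synchronizes with earlier releases).
  int sum = 0;
  for (unsigned i = 0; i < TEAMS; ++i)
    sum += partials[i];
  result = sum;
}

int main() {
  std::vector<std::thread> teams;
  for (unsigned t = 0; t < TEAMS; ++t)
    teams.emplace_back(team_main, t);
  for (auto &t : teams)
    t.join();
  std::printf("sum = %d (expected %u)\n", result, TEAMS * (TEAMS + 1) / 2);
  return 0;
}

One detail the sketch skips: the RTL's atomic::inc takes NumTeams as a wrap-around bound, so Cnt returns to zero after the last arrival and is ready for the next reduction region, whereas the plain fetch_add above keeps counting.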
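Once elected, the last team folds the NumTeams buffer slots using NumThreads threads in a strided loop before the warp-level steps. Here is a small single-file sketch of just that loop, again with a hypothetical int payload and + operator standing in for the glcpyFct/glredFct callbacks, and a serial loop over tid standing in for the team's threads and the subsequent warp reduction.

// strided_buffer_reduce.cpp -- illustrative sketch only; names are hypothetical.
#include <cstdint>
#include <cstdio>

// Assumes num_threads <= num_teams, which the RTL guarantees via
// roundToWarpsize(kmpcMin(NumThreads, NumTeams)) and the early return.
int reduce_last_team(const int *global_buffer, uint32_t num_teams,
                     uint32_t num_threads, uint32_t thread_id) {
  // Mirrors: glcpyFct(GlobalBuffer, ThreadId, reduce_data);
  int acc = global_buffer[thread_id];
  // Mirrors the strided loop: each thread folds slots
  // ThreadId, ThreadId + NumThreads, ThreadId + 2 * NumThreads, ...
  for (uint32_t i = num_threads + thread_id; i < num_teams; i += num_threads)
    acc += global_buffer[i]; // glredFct(GlobalBuffer, i, reduce_data);
  return acc; // per-thread partial; still needs the cross-warp reduction
}

int main() {
  constexpr uint32_t NumTeams = 10, NumThreads = 4;
  int buffer[NumTeams] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  int total = 0;
  // Serial stand-in for gpu_regular_warp_reduce/cpyFct/
  // gpu_irregular_warp_reduce combining the per-thread partials.
  for (uint32_t tid = 0; tid < NumThreads; ++tid)
    total += reduce_last_team(buffer, NumTeams, NumThreads, tid);
  std::printf("total = %d (expected 55)\n", total);
  return 0;
}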