Skip to content

[OpenMP] Team reduction work specialization #70766

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 2, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions openmp/libomptarget/DeviceRTL/src/Reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,109 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
false);
}

/// Mostly like _v2 but with the builtin assumption that we have less than
/// num_of_records (by default 1024) teams.
int32_t __kmpc_nvptx_teams_reduce_nowait_v3(
IdentTy *Loc, int32_t TId, void *__restrict__ GlobalBuffer,
uint32_t num_of_records, void *reduce_data, ShuffleReductFnTy shflFct,
InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct,
ListGlobalFnTy glcpyFct, ListGlobalFnTy glredFct) {
// Terminate all threads in non-SPMD mode except for the main thread.
uint32_t ThreadId = mapping::getThreadIdInBlock();
if (mapping::isGenericMode()) {
if (!mapping::isMainThreadInGenericMode())
return 0;
ThreadId = 0;
}

uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;

// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team main participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads = omp_get_num_threads();
uint32_t TeamId = omp_get_team_num();
uint32_t NumTeams = omp_get_num_teams();
static unsigned SHARED(ChunkTeamCount);

// Block progress for teams greater than the current upper
// limit. We always only allow a number of teams less or equal
// to the number of slots in the buffer.
bool IsMain = (ThreadId == 0);

if (IsMain) {
lgcpyFct(GlobalBuffer, TeamId, reduce_data);

// Propagate the memory writes above to the world.
fence::kernel(atomic::release);

// Increment team counter.
// This counter is incremented by all teams in the current
// BUFFER_SIZE chunk.
ChunkTeamCount = atomic::inc(&Cnt, NumTeams, atomic::acq_rel,
atomic::MemScopeTy::device);
}

// Synchronize in SPMD mode as in generic mode all but 1 threads are in the
// state machine.
if (mapping::isSPMDMode())
synchronize::threadsAligned(atomic::acq_rel);

// Each thread will have a local struct containing the values to be
// reduced:
// 1. do reduction within each warp.
// 2. do reduction across warps.
// 3. write the final result to the main reduction variable
// by returning 1 in the thread holding the reduction result.

// Check if this is the very last team.
if (ChunkTeamCount != NumTeams - 1)
return 0;

// Last team processing.
NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumTeams));
if (ThreadId >= NumThreads)
return 0;

// Ensure we see the global memory writes by other teams
fence::kernel(atomic::aquire);

// Load from buffer and reduce.
glcpyFct(GlobalBuffer, ThreadId, reduce_data);
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
glredFct(GlobalBuffer, i, reduce_data);

// Reduce across warps to the warp main.
gpu_regular_warp_reduce(reduce_data, shflFct);

uint32_t ActiveThreads = kmpcMin(NumTeams, NumThreads);
uint32_t WarpsNeeded =
(ActiveThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
// Gather all the reduced values from each warp
// to the first warp.
cpyFct(reduce_data, WarpsNeeded);

if (mapping::getWarpIdInBlock() == 0)
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);

return IsMain;
}

int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records,
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct,
ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct,
ListGlobalFnTy glredFct) {
// The first check is a compile time constant, the second one a runtime check.
// If the first one succeeds we will use the specialized version.
if ((state::getKernelEnvironment().Configuration.MaxTeams >= 0 &&
state::getKernelEnvironment().Configuration.MaxTeams <= num_of_records &&
num_of_records == 1024) ||
(omp_get_num_teams() <= num_of_records))
return __kmpc_nvptx_teams_reduce_nowait_v3(
Loc, TId, GlobalBuffer, num_of_records, reduce_data, shflFct, cpyFct,
lgcpyFct, lgredFct, glcpyFct, glredFct);

// Terminate all threads in non-SPMD mode except for the master thread.
uint32_t ThreadId = mapping::getThreadIdInBlock();
if (mapping::isGenericMode()) {
Expand Down