Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 30 additions & 21 deletions SYCL/ESIMD/BitonicSortK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
cl::sycl::range<1> SortLocalRange{1};

double total_time = 0;
{
try {
buffer<uint32_t, 1> bufi(pInputs, range<1>(size));
buffer<uint32_t, 1> bufo(pOutputs, range<1>(size));
// enqueue sort256 kernel
Expand All @@ -616,6 +616,9 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
});
e.wait();
total_time += report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

// Each HW thread swaps two 256-element chunks. Hence, we only need
Expand All @@ -631,28 +634,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
auto acc = buf.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(acc, j, i, tid);
});
});
k++;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
auto acc = buf.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(acc, j, i, tid);
});
});
k++;
}
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

mergeEvent[k - 1].wait();
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);

Expand Down
68 changes: 40 additions & 28 deletions SYCL/ESIMD/BitonicSortKv2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -523,15 +523,20 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {

// enqueue sort256 kernel
double total_time = 0;
auto e = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_sort_256(pInputs, pOutputs, i);
});
});
e.wait();
total_time += report_time("kernel time", e, e);
try {
auto e = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_sort_256(pInputs, pOutputs, i);
});
});
e.wait();
total_time += report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

// Each HW thread swaps two 256-element chunks. Hence, we only need
// to launch size/ (base_sort_size*2) HW threads
Expand All @@ -546,27 +551,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(pOutputs, j, i, tid);
});
});
// mergeEvent[k].wait();
k++;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(pOutputs, j, i,
tid);
});
});
// mergeEvent[k].wait();
k++;
}
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

mergeEvent[k - 1].wait();
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);

Expand Down
134 changes: 70 additions & 64 deletions SYCL/ESIMD/PrefixSum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,76 +298,82 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
void hierarchical_prefix(queue &q, unsigned *buf, unsigned elem_stride,
unsigned thrd_stride, unsigned n_entries,
unsigned entry_per_th) {
if (n_entries <= REMAINING_ENTRIES) {
std::cout << "... n_entries: " << n_entries
try {
if (n_entries <= REMAINING_ENTRIES) {
std::cout << "... n_entries: " << n_entries
<< " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;
// one single thread
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_final>(
range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
});
});
return;
}

std::cout << "*** n_entries: " << n_entries
<< " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;
// one single thread
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_final>(
range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
});
});
return;
}

std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;

if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES);
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES_LOW);
});
});
}

// if number of remaining entries <= 4K , each thread accumulates smaller
// number of entries to keep EUs saturated
if (n_entries / entry_per_th > 4096)
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES,
n_entries / entry_per_th, PREFIX_ENTRIES);
else
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW,
n_entries / entry_per_th, PREFIX_ENTRIES_LOW);

std::cout << "=== n_entries: " << n_entries << " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;

if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES);
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES_LOW);
});
});
});
});
}

// if number of remaining entries <= 4K , each thread accumulates smaller
// number of entries to keep EUs saturated
if (n_entries / entry_per_th > 4096)
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES,
n_entries / entry_per_th, PREFIX_ENTRIES);
else
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW,
n_entries / entry_per_th, PREFIX_ENTRIES_LOW);

std::cout << "=== n_entries: " << n_entries
<< " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;

if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES);
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES_LOW);
});
});
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
}
}

Expand Down
22 changes: 14 additions & 8 deletions SYCL/ESIMD/Prefix_Local_sum1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,14 +144,20 @@ int main(int argc, char *argv[]) {

// compute local sum for every chunk of PREFIX_ENTRIES
cl::sycl::range<2> GlobalRange{size / PREFIX_ENTRIES, 1};
auto e0 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Sum_tuple>(
GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_sum_tuple_count(pInputs, it.get_id(0));
});
});

e0.wait();
try {
auto e0 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Sum_tuple>(
GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_sum_tuple_count(pInputs, it.get_id(0));
});
});
e0.wait();
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(pInputs, ctxt);
free(pExpectOutputs);
return e.get_cl_code();
}

bool pass = memcmp(pInputs, pExpectOutputs,
size * TUPLE_SZ * sizeof(unsigned int)) == 0;
Expand Down
24 changes: 15 additions & 9 deletions SYCL/ESIMD/Prefix_Local_sum2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,21 @@ int main(int argc, char *argv[]) {
malloc(size * TUPLE_SZ * sizeof(unsigned int)));
compute_local_prefixsum(pInputs, pExpectOutputs, size);

auto e1 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative>(
range<2>{size / PREFIX_ENTRIES, 1} * LocalRange,
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES);
});
});

e1.wait();
try {
auto e1 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative>(
range<2>{size / PREFIX_ENTRIES, 1} * LocalRange,
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES);
});
});
e1.wait();
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(pInputs, ctxt);
free(pExpectOutputs);
return e.get_cl_code();
}

bool pass = memcmp(pInputs, pExpectOutputs,
size * TUPLE_SZ * sizeof(unsigned int)) == 0;
Expand Down
Loading