Skip to content

[SYCL][CUDA] Select only NVPTX64 device binaries #1223

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 3, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,9 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4;
#define PI_DEVICE_BINARY_TARGET_SPIRV64_GEN "spir64_gen"
#define PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA "spir64_fpga"

/// PTX 64-bit image <-> "nvptx64", 64-bit NVIDIA PTX device
#define PI_DEVICE_BINARY_TARGET_NVPTX64 "nvptx64"

/// This struct is a record of the device binary information. If the Kind field
/// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec field
/// can still be specific and denote e.g. FPGA target.
Expand Down
23 changes: 17 additions & 6 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -673,10 +673,10 @@ pi_result cuda_piDevicePartition(
return {};
}

/// Selects the device binary image to use on the given CUDA device.
///
/// Scans \p binaries for the first image whose DeviceTargetSpec is the
/// NVPTX64 target ("nvptx64", see PI_DEVICE_BINARY_TARGET_NVPTX64) and
/// stores it in \p selected_binary. Images for other targets (e.g. SPIR-V)
/// are skipped, since the CUDA backend can only consume PTX.
///
/// \param device           device the binary will run on (currently unused:
///                         every CUDA device consumes NVPTX64 images).
/// \param binaries         candidate device binary images; must be non-null.
/// \param num_binaries     number of entries in \p binaries.
/// \param selected_binary  out-parameter receiving the chosen image; must be
///                         non-null.
/// \return PI_SUCCESS if an NVPTX64 image was found, PI_INVALID_BINARY
///         otherwise.
pi_result cuda_piextDeviceSelectBinary(pi_device device,
                                       pi_device_binary *binaries,
                                       pi_uint32 num_binaries,
                                       pi_device_binary *selected_binary) {
  // Invalid arguments are programmer errors, not runtime conditions, so the
  // existing convention in this plugin is to die() rather than return a code.
  if (!binaries) {
    cl::sycl::detail::pi::die("No list of device images provided");
  }
  // NOTE(review): this check was in the collapsed diff context; it guards the
  // search below against an empty list — confirm wording against upstream.
  if (num_binaries < 1) {
    cl::sycl::detail::pi::die("No binary images in the list");
  }
  if (!selected_binary) {
    cl::sycl::detail::pi::die("No storage for device binary provided");
  }

  // Look for an image for the NVPTX64 target, and return the first one that
  // is found.
  for (pi_uint32 i = 0; i < num_binaries; i++) {
    if (strcmp(binaries[i]->DeviceTargetSpec,
               PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) {
      *selected_binary = binaries[i];
      return PI_SUCCESS;
    }
  }

  // No image can be loaded for the given device.
  return PI_INVALID_BINARY;
}

pi_result cuda_piextGetDeviceFunctionPointer(pi_device device,
Expand Down
70 changes: 70 additions & 0 deletions sycl/test/plugins/sycl-targets-order.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// RUN: %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown-sycldevice,nvptx64-unknown-unknown-sycldevice %s -o %t-spir64-nvptx64.out
// RUN: env SYCL_BE=PI_OPENCL %t-spir64-nvptx64.out
// RUN: env SYCL_BE=PI_CUDA %t-spir64-nvptx64.out
// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-unknown-unknown-sycldevice,spir64-unknown-unknown-sycldevice %s -o %t-nvptx64-spir64.out
// RUN: env SYCL_BE=PI_OPENCL %t-nvptx64-spir64.out
// RUN: env SYCL_BE=PI_CUDA %t-nvptx64-spir64.out

// REQUIRES: opencl, cuda

//==------- sycl-targets-order.cpp - SYCL -fsycl-targets order test --------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl.hpp>
#include <iostream>

int main(int argc, char **argv) {

// select the default SYCL device
cl::sycl::device device{cl::sycl::default_selector{}};
std::cout << "Running on SYCL device " << device.get_info<cl::sycl::info::device::name>()
<< ", driver version " << device.get_info<cl::sycl::info::device::driver_version>()
<< std::endl;

// create a queue
cl::sycl::queue queue{device};

// create a buffer of 4 ints to be used inside the kernel code
cl::sycl::buffer<unsigned int, 1> buffer(4);

// size of the index space for the kernel
cl::sycl::range<1> NumOfWorkItems{buffer.get_count()};

// submit a command group(work) to the queue
queue.submit([&](cl::sycl::handler &cgh) {
// get write only access to the buffer on a device
auto accessor = buffer.get_access<cl::sycl::access::mode::write>(cgh);
// executing the kernel
cgh.parallel_for<class FillBuffer>(
NumOfWorkItems, [=](cl::sycl::id<1> WIid) {
// fill the buffer with indexes
accessor[WIid] = WIid.get(0);
});
});

// get read-only access to the buffer on the host
// introduce an implicit barrier waiting for queue to complete the work
const auto host_accessor = buffer.get_access<cl::sycl::access::mode::read>();

// check the results
bool mismatch = false;
for (unsigned int i = 0; i < buffer.get_count(); ++i) {
if (host_accessor[i] != i) {
std::cout << "The result is incorrect for element: " << i
<< " , expected: " << i << " , got: " << host_accessor[i]
<< std::endl;
mismatch = true;
}
}

if (not mismatch) {
std::cout << "The results are correct!" << std::endl;
}

return mismatch;
}