diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 4fd8bcf75435f..a4cd8b855502d 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -1643,21 +1643,22 @@ pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags,
   try {
     ScopedContext active(context);
    CUdeviceptr ptr;
-    _pi_mem::alloc_mode allocMode = _pi_mem::alloc_mode::classic;
+    _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
+        _pi_mem::mem_::buffer_mem_::alloc_mode::classic;

     if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) {
       retErr = PI_CHECK_ERROR(
           cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
       retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
-      allocMode = _pi_mem::alloc_mode::use_host_ptr;
+      allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr;
     } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) {
       retErr = PI_CHECK_ERROR(cuMemAllocHost(&host_ptr, size));
       retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0));
-      allocMode = _pi_mem::alloc_mode::alloc_host_ptr;
+      allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr;
     } else {
       retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size));
       if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) {
-        allocMode = _pi_mem::alloc_mode::copy_in;
+        allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in;
       }
     }

@@ -1713,21 +1714,31 @@ pi_result cuda_piMemRelease(pi_mem memObj) {
     // make sure memObj is released in case PI_CHECK_ERROR throws
     std::unique_ptr<_pi_mem> uniqueMemObj(memObj);

-    if (!memObj->is_sub_buffer()) {
+    if (memObj->is_sub_buffer()) {
+      return PI_SUCCESS;
+    }

-      ScopedContext active(uniqueMemObj->get_context());
+    ScopedContext active(uniqueMemObj->get_context());

-      switch (uniqueMemObj->allocMode_) {
-      case _pi_mem::alloc_mode::copy_in:
-      case _pi_mem::alloc_mode::classic:
-        ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->ptr_));
+    if (memObj->mem_type_ == _pi_mem::mem_type::buffer) {
+      switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) {
+      case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in:
+      case _pi_mem::mem_::buffer_mem_::alloc_mode::classic:
+        ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_));
         break;
-      case _pi_mem::alloc_mode::use_host_ptr:
-        ret = PI_CHECK_ERROR(cuMemHostUnregister(uniqueMemObj->hostPtr_));
+      case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr:
+        ret = PI_CHECK_ERROR(
+            cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
         break;
-      case _pi_mem::alloc_mode::alloc_host_ptr:
-        ret = PI_CHECK_ERROR(cuMemFreeHost(uniqueMemObj->hostPtr_));
+      case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr:
+        ret = PI_CHECK_ERROR(
+            cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_));
       };
+    } else if (memObj->mem_type_ == _pi_mem::mem_type::surface) {
+      ret = PI_CHECK_ERROR(
+          cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface()));
+      ret = PI_CHECK_ERROR(
+          cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array()));
     }

   } catch (pi_result err) {
@@ -1777,19 +1788,22 @@ pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags,
   assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) &&
          "Overflow");
   assert(((bufferRegion.origin + bufferRegion.size) <=
-          parent_buffer->get_size()) &&
+          parent_buffer->mem_.buffer_mem_.get_size()) &&
          "PI_INVALID_BUFFER_SIZE");
   // Retained indirectly due to retaining parent buffer below.
   pi_context context = parent_buffer->context_;

-  _pi_mem::alloc_mode allocMode = _pi_mem::alloc_mode::classic;
+  _pi_mem::mem_::buffer_mem_::alloc_mode allocMode =
+      _pi_mem::mem_::buffer_mem_::alloc_mode::classic;

-  assert(parent_buffer->ptr_ != _pi_mem::native_type{0});
-  _pi_mem::native_type ptr = parent_buffer->ptr_ + bufferRegion.origin;
+  assert(parent_buffer->mem_.buffer_mem_.ptr_ !=
+         _pi_mem::mem_::buffer_mem_::native_type{0});
+  _pi_mem::mem_::buffer_mem_::native_type ptr =
+      parent_buffer->mem_.buffer_mem_.ptr_ + bufferRegion.origin;

   void *hostPtr = nullptr;
-  if (parent_buffer->hostPtr_) {
-    hostPtr =
-        static_cast<char *>(parent_buffer->hostPtr_) + bufferRegion.origin;
+  if (parent_buffer->mem_.buffer_mem_.hostPtr_) {
+    hostPtr = static_cast<char *>(parent_buffer->mem_.buffer_mem_.hostPtr_) +
+              bufferRegion.origin;
   }

   ReleaseGuard<pi_mem> releaseGuard(parent_buffer);
@@ -1828,7 +1842,7 @@ pi_result cuda_piMemGetInfo(pi_mem memObj, cl_mem_info queriedInfo,
 /// \return PI_SUCCESS
 pi_result cuda_piextMemGetNativeHandle(pi_mem mem,
                                        pi_native_handle *nativeHandle) {
-  *nativeHandle = static_cast<pi_native_handle>(mem->get());
+  *nativeHandle = static_cast<pi_native_handle>(mem->mem_.buffer_mem_.get());
   return PI_SUCCESS;
 }

@@ -2020,7 +2034,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer,
   assert(command_queue != nullptr);
   pi_result retErr = PI_SUCCESS;
   CUstream cuStream = command_queue->get();
-  CUdeviceptr devPtr = buffer->get();
+  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
   std::unique_ptr<_pi_event> retImplEv{nullptr};

   try {
@@ -2066,7 +2080,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer,
   assert(command_queue != nullptr);
   pi_result retErr = PI_SUCCESS;
   CUstream cuStream = command_queue->get();
-  CUdeviceptr devPtr = buffer->get();
+  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
   std::unique_ptr<_pi_event> retImplEv{nullptr};

   try {
@@ -2202,8 +2216,25 @@ pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index,

   pi_result retErr = PI_SUCCESS;
   try {
-    CUdeviceptr cuPtr = (*arg_value)->get();
-    kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr);
+    pi_mem arg_mem = *arg_value;
+    if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) {
+      CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+      PI_CHECK_ERROR(cuArray3DGetDescriptor(
+          &arrayDesc, arg_mem->mem_.surface_mem_.get_array()));
+      if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 &&
+          arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 &&
+          arrayDesc.Format != CU_AD_FORMAT_HALF &&
+          arrayDesc.Format != CU_AD_FORMAT_FLOAT) {
+        cl::sycl::detail::pi::die(
+            "PI CUDA kernels only support images with channel types int32, "
+            "uint32, float, and half.");
+      }
+      CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface();
+      kernel->set_kernel_arg(arg_index, sizeof(cuSurf), (void *)&cuSurf);
+    } else {
+      CUdeviceptr cuPtr = arg_mem->mem_.buffer_mem_.get();
+      kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr);
+    }
   } catch (pi_result err) {
     retErr = err;
   }
@@ -2326,8 +2357,155 @@ pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags,
                                 const pi_image_format *image_format,
                                 const pi_image_desc *image_desc,
                                 void *host_ptr, pi_mem *ret_mem) {
-  cl::sycl::detail::pi::die("cuda_piMemImageCreate not implemented");
-  return {};
+  // Need input memory object
+  assert(ret_mem != nullptr);
+  const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) ||
+                                  ((flags & PI_MEM_FLAGS_HOST_PTR_USE));
+  pi_result retErr = PI_SUCCESS;
+
+  // We only support RGBA channel order
+  // TODO: check SYCL CTS and spec. May also have to support BGRA
+  if (image_format->image_channel_order !=
+      pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) {
+    cl::sycl::detail::pi::die(
+        "cuda_piMemImageCreate only supports RGBA channel order");
+  }
+
+  // We have to use cuArray3DCreate, which has some caveats. The height and
+  // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc
+  // gives a minimum value of 1, so we need to convert the value.
+  CUDA_ARRAY3D_DESCRIPTOR array_desc;
+  array_desc.NumChannels = 4; // Only support 4 channel image
+  array_desc.Flags = 0;       // No flags required
+  array_desc.Width = image_desc->image_width;
+  if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
+    array_desc.Height = 0;
+    array_desc.Depth = 0;
+  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
+    array_desc.Height = image_desc->image_height;
+    array_desc.Depth = 0;
+  } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
+    array_desc.Height = image_desc->image_height;
+    array_desc.Depth = image_desc->image_depth;
+  }
+
+  // We need to get this now in bytes for calculating the total image size
+  // later
+  size_t pixel_type_size_bytes;
+
+  switch (image_format->image_channel_data_type) {
+  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8:
+  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
+    array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    pixel_type_size_bytes = 1;
+    break;
+  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
+    array_desc.Format = CU_AD_FORMAT_SIGNED_INT8;
+    pixel_type_size_bytes = 1;
+    break;
+  case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16:
+  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
+    array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
+    pixel_type_size_bytes = 2;
+    break;
+  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
+    array_desc.Format = CU_AD_FORMAT_SIGNED_INT16;
+    pixel_type_size_bytes = 2;
+    break;
+  case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
+    array_desc.Format = CU_AD_FORMAT_HALF;
+    pixel_type_size_bytes = 2;
+    break;
+  case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
+    array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
+    pixel_type_size_bytes = 4;
+    break;
+  case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
+    array_desc.Format = CU_AD_FORMAT_SIGNED_INT32;
+    pixel_type_size_bytes = 4;
+    break;
+  case PI_IMAGE_CHANNEL_TYPE_FLOAT:
+    array_desc.Format = CU_AD_FORMAT_FLOAT;
+    pixel_type_size_bytes = 4;
+    break;
+  default:
+    cl::sycl::detail::pi::die(
+        "cuda_piMemImageCreate given unsupported image_channel_data_type");
+  }
+
+  // When a dimension isn't used, image_desc has the size set to 1
+  size_t pixel_size_bytes =
+      pixel_type_size_bytes * 4; // 4 is the only number of channels we support
+  size_t image_size_bytes = pixel_size_bytes * image_desc->image_width *
+                            image_desc->image_height * image_desc->image_depth;
+
+  ScopedContext active(context);
+  CUarray image_array;
+  retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc));
+
+  try {
+    if (performInitialCopy) {
+      // We have to use a different copy function for each image dimensionality
+      if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) {
+        retErr = PI_CHECK_ERROR(
+            cuMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes));
+      } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) {
+        CUDA_MEMCPY2D cpy_desc;
+        memset(&cpy_desc, 0, sizeof(cpy_desc));
+        cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+        cpy_desc.srcHost = host_ptr;
+        cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+        cpy_desc.dstArray = image_array;
+        cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
+        cpy_desc.Height = image_desc->image_height;
+        retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc));
+      } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) {
+        CUDA_MEMCPY3D cpy_desc;
+        memset(&cpy_desc, 0, sizeof(cpy_desc));
+        cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+        cpy_desc.srcHost = host_ptr;
+        cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+        cpy_desc.dstArray = image_array;
+        cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width;
+        cpy_desc.Height = image_desc->image_height;
+        cpy_desc.Depth = image_desc->image_depth;
+        retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc));
+      }
+    }
+
+    // CUDA_RESOURCE_DESC is a union of different structs, shown here
+    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
+    // We need to fill it as described here to use it for a surface or texture
+    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
+    // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
+    // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array
+    // handle.
+    // CUDA_RESOURCE_DESC::flags must be set to zero
+
+    CUDA_RESOURCE_DESC image_res_desc;
+    image_res_desc.res.array.hArray = image_array;
+    image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY;
+    image_res_desc.flags = 0;
+
+    CUsurfObject surface;
+    retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc));
+
+    auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{
+        context, image_array, surface, image_desc->image_type, host_ptr});
+
+    if (piMemObj == nullptr) {
+      return PI_OUT_OF_HOST_MEMORY;
+    }
+
+    *ret_mem = piMemObj.release();
+  } catch (pi_result err) {
+    cuArrayDestroy(image_array);
+    return err;
+  } catch (...) {
+    cuArrayDestroy(image_array);
+    return PI_ERROR_UNKNOWN;
+  }
+
+  return retErr;
 }

 /// \TODO Not implemented
@@ -3087,7 +3265,7 @@ pi_result cuda_piEnqueueMemBufferReadRect(
   pi_result retErr = PI_SUCCESS;
   CUstream cuStream = command_queue->get();
-  CUdeviceptr devPtr = buffer->get();
+  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
   std::unique_ptr<_pi_event> retImplEv{nullptr};

   try {
@@ -3138,7 +3316,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect(
   pi_result retErr = PI_SUCCESS;
   CUstream cuStream = command_queue->get();
-  CUdeviceptr devPtr = buffer->get();
+  CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get();
   std::unique_ptr<_pi_event> retImplEv{nullptr};

   try {
@@ -3197,8 +3375,8 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer,
   pi_result result;

   auto stream = command_queue->get();
-  auto src = src_buffer->get() + src_offset;
-  auto dst = dst_buffer->get() + dst_offset;
+  auto src = src_buffer->mem_.buffer_mem_.get() + src_offset;
+  auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset;

   result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream));

@@ -3230,8 +3408,8 @@ pi_result cuda_piEnqueueMemBufferCopyRect(
   pi_result retErr = PI_SUCCESS;
   CUstream cuStream = command_queue->get();
-  CUdeviceptr srcPtr = src_buffer->get();
-  CUdeviceptr dstPtr = dst_buffer->get();
+  CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get();
+  CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get();
   std::unique_ptr<_pi_event> retImplEv{nullptr};

   try {
@@ -3295,7 +3473,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
   pi_result result;

-  auto dstDevice = buffer->get() + offset;
+  auto dstDevice = buffer->mem_.buffer_mem_.get() + offset;
   auto stream = command_queue->get();
   auto N = size / pattern_size;

@@ -3358,17 +3536,158 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer,
     return PI_ERROR_UNKNOWN;
   }
 }
-/// \TODO Not implemented in CUDA, requires untie from OpenCL
+
+static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) {
+  switch (array_desc.Format) {
+  case CU_AD_FORMAT_UNSIGNED_INT8:
+  case CU_AD_FORMAT_SIGNED_INT8:
+    return 1;
+  case CU_AD_FORMAT_UNSIGNED_INT16:
+  case CU_AD_FORMAT_SIGNED_INT16:
+  case CU_AD_FORMAT_HALF:
+    return 2;
+  case CU_AD_FORMAT_UNSIGNED_INT32:
+  case CU_AD_FORMAT_SIGNED_INT32:
+  case CU_AD_FORMAT_FLOAT:
+    return 4;
+  }
+  cl::sycl::detail::pi::die("Invalid image format.");
+  return 0;
+}
+
+/// General ND memory copy operation for images (where N > 1).
+/// This function requires the corresponding CUDA context to be at the top of
+/// the context stack.
+/// If the source and/or destination is an array, src_ptr and/or dst_ptr
+/// must be a pointer to a CUarray.
+static pi_result commonEnqueueMemImageNDCopy(
+    CUstream cu_stream, pi_mem_type img_type, const size_t *region,
+    const void *src_ptr, const CUmemorytype_enum src_type,
+    const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type,
+    const size_t *dst_offset) {
+  assert(region != nullptr);
+
+  assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST);
+  assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST);
+
+  if (img_type == PI_MEM_TYPE_IMAGE2D) {
+    CUDA_MEMCPY2D cpyDesc;
+    memset(&cpyDesc, 0, sizeof(cpyDesc));
+    cpyDesc.srcMemoryType = src_type;
+    if (src_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+      cpyDesc.srcXInBytes = src_offset[0];
+      cpyDesc.srcY = src_offset[1];
+    } else {
+      cpyDesc.srcHost = src_ptr;
+    }
+    cpyDesc.dstMemoryType = dst_type;
+    if (dst_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+      cpyDesc.dstXInBytes = dst_offset[0];
+      cpyDesc.dstY = dst_offset[1];
+    } else {
+      cpyDesc.dstHost = dst_ptr;
+    }
+    cpyDesc.WidthInBytes = region[0];
+    cpyDesc.Height = region[1];
+    return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream));
+  }
+  if (img_type == PI_MEM_TYPE_IMAGE3D) {
+    CUDA_MEMCPY3D cpyDesc;
+    memset(&cpyDesc, 0, sizeof(cpyDesc));
+    cpyDesc.srcMemoryType = src_type;
+    if (src_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.srcArray = *static_cast<const CUarray *>(src_ptr);
+      cpyDesc.srcXInBytes = src_offset[0];
+      cpyDesc.srcY = src_offset[1];
+      cpyDesc.srcZ = src_offset[2];
+    } else {
+      cpyDesc.srcHost = src_ptr;
+    }
+    cpyDesc.dstMemoryType = dst_type;
+    if (dst_type == CU_MEMORYTYPE_ARRAY) {
+      cpyDesc.dstArray = *static_cast<CUarray *>(dst_ptr);
+      cpyDesc.dstXInBytes = dst_offset[0];
+      cpyDesc.dstY = dst_offset[1];
+      cpyDesc.dstZ = dst_offset[2];
+    } else {
+      cpyDesc.dstHost = dst_ptr;
+    }
+    cpyDesc.WidthInBytes = region[0];
+    cpyDesc.Height = region[1];
+    cpyDesc.Depth = region[2];
+    return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream));
+  }
+  return PI_INVALID_VALUE;
+}
+
 pi_result cuda_piEnqueueMemImageRead(
     pi_queue command_queue, pi_mem image, pi_bool blocking_read,
     const size_t *origin, const size_t *region, size_t row_pitch,
     size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list,
     const pi_event *event_wait_list, pi_event *event) {
-  cl::sycl::detail::pi::die("cuda_piEnqueueMemImageRead not implemented");
-  return {};
+  assert(command_queue != nullptr);
+  assert(image != nullptr);
+  assert(image->mem_type_ == _pi_mem::mem_type::surface);
+
+  pi_result retErr = PI_SUCCESS;
+  CUstream cuStream = command_queue->get();
+
+  try {
+    ScopedContext active(command_queue->get_context());
+
+    if (event_wait_list) {
+      cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
+                               event_wait_list, nullptr);
+    }
+
+    CUarray array = image->mem_.surface_mem_.get_array();
+
+    CUDA_ARRAY_DESCRIPTOR arrayDesc;
+    retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
+
+    int elementByteSize = imageElementByteSize(arrayDesc);
+
+    size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
+    size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
+
+    pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
+    if (imgType == PI_MEM_TYPE_IMAGE1D) {
+      retErr = PI_CHECK_ERROR(
+          cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream));
+    } else {
+      size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
+      size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]};
+
+      retErr = commonEnqueueMemImageNDCopy(
+          cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY,
+          srcOffset, ptr, CU_MEMORYTYPE_HOST, nullptr);
+
+      if (retErr != PI_SUCCESS) {
+        return retErr;
+      }
+    }
+
+    if (event) {
+      auto new_event =
+          _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, command_queue);
+      new_event->record();
+      *event = new_event;
+    }
+
+    if (blocking_read) {
+      retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
+    }
+  } catch (pi_result err) {
+    return err;
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
+  }
+
+  return retErr;
 }

-/// \TODO Not implemented in CUDA, requires untie from OpenCL
 pi_result
 cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image,
                             pi_bool blocking_write, const size_t *origin,
@@ -3376,11 +3695,63 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image,
                             size_t input_slice_pitch, const void *ptr,
                             pi_uint32 num_events_in_wait_list,
                             const pi_event *event_wait_list, pi_event *event) {
-  cl::sycl::detail::pi::die("cuda_piEnqueueMemImageWrite not implemented");
-  return {};
+  assert(command_queue != nullptr);
+  assert(image != nullptr);
+  assert(image->mem_type_ == _pi_mem::mem_type::surface);
+
+  pi_result retErr = PI_SUCCESS;
+  CUstream cuStream = command_queue->get();
+
+  try {
+    ScopedContext active(command_queue->get_context());
+
+    if (event_wait_list) {
+      cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
+                               event_wait_list, nullptr);
+    }
+
+    CUarray array = image->mem_.surface_mem_.get_array();
+
+    CUDA_ARRAY_DESCRIPTOR arrayDesc;
+    retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array));
+
+    int elementByteSize = imageElementByteSize(arrayDesc);
+
+    size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels;
+    size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0];
+
+    pi_mem_type imgType = image->mem_.surface_mem_.get_image_type();
+    if (imgType == PI_MEM_TYPE_IMAGE1D) {
+      retErr = PI_CHECK_ERROR(
+          cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream));
+    } else {
+      size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
+      size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]};
+
+      retErr = commonEnqueueMemImageNDCopy(
+          cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST, nullptr,
+          &array, CU_MEMORYTYPE_ARRAY, dstOffset);
+
+      if (retErr != PI_SUCCESS) {
+        return retErr;
+      }
+    }
+
+    if (event) {
+      auto new_event =
+          _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, command_queue);
+      new_event->record();
+      *event = new_event;
+    }
+  } catch (pi_result err) {
+    return err;
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
+  }
+
+  return retErr;
 }

-/// \TODO Not implemented in CUDA, requires untie from OpenCL
 pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
                                      pi_mem dst_image, const size_t *src_origin,
                                      const size_t *dst_origin,
@@ -3388,8 +3759,72 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image,
                                      pi_uint32 num_events_in_wait_list,
                                      const pi_event *event_wait_list,
                                      pi_event *event) {
-  cl::sycl::detail::pi::die("cuda_piEnqueueMemImageCopy not implemented");
-  return {};
+  assert(src_image->mem_type_ == _pi_mem::mem_type::surface);
+  assert(dst_image->mem_type_ == _pi_mem::mem_type::surface);
+  assert(src_image->mem_.surface_mem_.get_image_type() ==
+         dst_image->mem_.surface_mem_.get_image_type());
+
+  pi_result retErr = PI_SUCCESS;
+  CUstream cuStream = command_queue->get();
+
+  try {
+    ScopedContext active(command_queue->get_context());
+
+    if (event_wait_list) {
+      cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list,
+                               event_wait_list, nullptr);
+    }
+
+    CUarray srcArray = src_image->mem_.surface_mem_.get_array();
+    CUarray dstArray = dst_image->mem_.surface_mem_.get_array();
+
+    CUDA_ARRAY_DESCRIPTOR srcArrayDesc;
+    retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray));
+    CUDA_ARRAY_DESCRIPTOR dstArrayDesc;
+    retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray));
+
+    assert(srcArrayDesc.Format == dstArrayDesc.Format);
+    assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels);
+
+    int elementByteSize = imageElementByteSize(srcArrayDesc);
+
+    size_t dstByteOffsetX =
+        dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels;
+    size_t srcByteOffsetX =
+        src_origin[0] * elementByteSize * dstArrayDesc.NumChannels;
+    size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0];
+
+    pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type();
+    if (imgType == PI_MEM_TYPE_IMAGE1D) {
+      retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray,
+                                           srcByteOffsetX, bytesToCopy));
+    } else {
+      size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]};
+      size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]};
+      size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]};
+
+      retErr = commonEnqueueMemImageNDCopy(
+          cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY,
+          srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset);
+
+      if (retErr != PI_SUCCESS) {
+        return retErr;
+      }
+    }
+
+    if (event) {
+      auto new_event =
+          _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, command_queue);
+      new_event->record();
+      *event = new_event;
+    }
+  } catch (pi_result err) {
+    return err;
+  } catch (...) {
+    return PI_ERROR_UNKNOWN;
+  }
+
+  return retErr;
 }

 /// \TODO Not implemented in CUDA, requires untie from OpenCL
@@ -3421,13 +3856,13 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer,
   pi_result ret_err = PI_INVALID_OPERATION;

   // Currently no support for overlapping regions
-  if (buffer->get_map_ptr() != nullptr) {
+  if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) {
     return ret_err;
   }

   // Allocate a pointer in the host to store the mapped information
-  auto hostPtr = buffer->map_to_ptr(offset, map_flags);
-  *ret_map = buffer->get_map_ptr();
+  auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags);
+  *ret_map = buffer->mem_.buffer_mem_.get_map_ptr();
   if (hostPtr) {
     ret_err = PI_SUCCESS;
   }
@@ -3466,15 +3901,17 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
   assert(command_queue != nullptr);
   assert(mapped_ptr != nullptr);
   assert(memobj != nullptr);
-  assert(memobj->get_map_ptr() != nullptr);
-  assert(memobj->get_map_ptr() == mapped_ptr);
+  assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr);
+  assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr);

-  if ((memobj->get_map_flags() & CL_MAP_WRITE) ||
-      (memobj->get_map_flags() & CL_MAP_WRITE_INVALIDATE_REGION)) {
+  if ((memobj->mem_.buffer_mem_.get_map_flags() & CL_MAP_WRITE) ||
+      (memobj->mem_.buffer_mem_.get_map_flags() &
+       CL_MAP_WRITE_INVALIDATE_REGION)) {
     ret_err = cuda_piEnqueueMemBufferWrite(
-        command_queue, memobj, true, memobj->get_map_offset(mapped_ptr),
-        memobj->get_size(), mapped_ptr, num_events_in_wait_list,
-        event_wait_list, retEvent);
+        command_queue, memobj, true,
+        memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr),
+        memobj->mem_.buffer_mem_.get_size(), mapped_ptr,
+        num_events_in_wait_list, event_wait_list, retEvent);
   } else {
     if (retEvent) {
       try {
@@ -3489,7 +3926,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
     }
   }

-  memobj->unmap(mapped_ptr);
+  memobj->mem_.buffer_mem_.unmap(mapped_ptr);

   return ret_err;
 }
diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp
index 333264ad6b011..b0b1fa2b6ae7e 100644
--- a/sycl/plugins/cuda/pi_cuda.hpp
+++ b/sycl/plugins/cuda/pi_cuda.hpp
@@ -179,102 +179,146 @@ struct _pi_context {
 /// PI Mem mapping to a CUDA memory allocation
 ///
 struct _pi_mem {
-  using native_type = CUdeviceptr;
+
+  // TODO: Move as much shared data up as possible
   using pi_context = _pi_context *;

   pi_context context_;
-  pi_mem parent_;
-  native_type ptr_;
-
-  void *hostPtr_;
-  size_t size_;
-  size_t mapOffset_;
-  void *mapPtr_;
-  cl_map_flags mapFlags_;

   std::atomic_uint32_t refCount_;

-  /** alloc_mode
-   * classic: Just a normal buffer allocated on the device via cuda malloc
-   * use_host_ptr: Use an address on the host for the device
-   * copy_in: The data for the device comes from the host but the host pointer
-   is not available later for re-use
-   * alloc_host_ptr: Uses pinned-memory allocation
-   */
-  enum class alloc_mode {
-    classic,
-    use_host_ptr,
-    copy_in,
-    alloc_host_ptr
-  } allocMode_;
-
-  _pi_mem(pi_context ctxt, pi_mem parent, alloc_mode mode, CUdeviceptr ptr, void *host_ptr,
-          size_t size)
-      : context_{ctxt}, parent_{parent}, ptr_{ptr}, hostPtr_{host_ptr}, size_{size},
-        mapOffset_{0}, mapPtr_{nullptr}, mapFlags_{CL_MAP_WRITE}, refCount_{1}, allocMode_{mode} {
-    if (is_sub_buffer()) {
-      cuda_piMemRetain(parent_);
-    } else {
-      cuda_piContextRetain(context_);
+  enum class mem_type { buffer, surface } mem_type_;
+
+  union mem_ {
+    struct buffer_mem_ {
+      using native_type = CUdeviceptr;
+
+      pi_mem parent_;
+      native_type ptr_;
+      void *hostPtr_;
+      size_t size_;
+
+      size_t mapOffset_;
+      void *mapPtr_;
+      cl_map_flags mapFlags_;
+
+      /** alloc_mode
+       * classic: Just a normal buffer allocated on the device via cuda malloc
+       * use_host_ptr: Use an address on the host for the device
+       * copy_in: The data for the device comes from the host but the host
+       pointer is not available later for re-use
+       * alloc_host_ptr: Uses pinned-memory allocation
+       */
+      enum class alloc_mode {
+        classic,
+        use_host_ptr,
+        copy_in,
+        alloc_host_ptr
+      } allocMode_;
+
+      native_type get() const noexcept { return ptr_; }
+
+      size_t get_size() const noexcept { return size_; }
+
+      void *get_map_ptr() const noexcept { return mapPtr_; }
+
+      size_t get_map_offset(void *ptr) const noexcept { return mapOffset_; }
+
+      void *map_to_ptr(size_t offset, cl_map_flags flags) noexcept {
+        assert(mapPtr_ == nullptr);
+        mapOffset_ = offset;
+        mapFlags_ = flags;
+        if (hostPtr_) {
+          mapPtr_ = static_cast<char *>(hostPtr_) + offset;
+        } else {
+          // TODO: Allocate only what is needed based on the offset
+          mapPtr_ = static_cast<char *>(malloc(this->get_size()));
+        }
+        return mapPtr_;
+      }
-    }
-  };
-
-  ~_pi_mem() {
-    if (is_sub_buffer()) {
-      cuda_piMemRelease(parent_);
-    } else {
-      cuda_piContextRelease(context_);
-    }
-  }
-
-  /// \TODO: Adapt once images are supported.
-  bool is_buffer() const noexcept { return true; }
-  bool is_sub_buffer() const noexcept {
-    return (is_buffer() && (parent_ != nullptr));
-  }
+      void unmap(void *ptr) noexcept {
+        assert(mapPtr_ != nullptr);

-  native_type get() const noexcept { return ptr_; }
-
-  pi_context get_context() const noexcept { return context_; }
+        if (mapPtr_ != hostPtr_) {
+          free(mapPtr_);
+        }
+        mapPtr_ = nullptr;
+        mapOffset_ = 0;
+      }

-  pi_uint32 increment_reference_count() noexcept { return ++refCount_; }
+      cl_map_flags get_map_flags() const noexcept {
+        assert(mapPtr_ != nullptr);
+        return mapFlags_;
+      }
+    } buffer_mem_;
+
+    struct surface_mem_ {
+      CUarray array_;
+      CUsurfObject surfObj_;
+      pi_mem_type imageType_;
+
+      CUarray get_array() const noexcept { return array_; }
+
+      CUsurfObject get_surface() const noexcept { return surfObj_; }
+
+      pi_mem_type get_image_type() const noexcept { return imageType_; }
+    } surface_mem_;
+  } mem_;
+
+  // Buffer constructor
+  _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode,
+          CUdeviceptr ptr, void *host_ptr, size_t size)
+      : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} {
+    mem_.buffer_mem_.ptr_ = ptr;
+    mem_.buffer_mem_.parent_ = parent;
+    mem_.buffer_mem_.hostPtr_ = host_ptr;
+    mem_.buffer_mem_.size_ = size;
+    mem_.buffer_mem_.mapOffset_ = 0;
+    mem_.buffer_mem_.mapPtr_ = nullptr;
+    mem_.buffer_mem_.mapFlags_ = CL_MAP_WRITE;
+    mem_.buffer_mem_.allocMode_ = mode;
+    if (is_sub_buffer()) {
+      cuda_piMemRetain(mem_.buffer_mem_.parent_);
+    } else {
+      cuda_piContextRetain(context_);
+    }
+  };

-  pi_uint32 decrement_reference_count() noexcept { return --refCount_; }
+  // Surface constructor
+  _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf,
+          pi_mem_type image_type, void *host_ptr)
+      : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} {
+    mem_.surface_mem_.array_ = array;
+    mem_.surface_mem_.surfObj_ = surf;
+    mem_.surface_mem_.imageType_ = image_type;
+    cuda_piContextRetain(context_);
+  }

-  pi_uint32 get_reference_count() const noexcept { return refCount_; }
+  ~_pi_mem() {
+    if (mem_type_ == mem_type::buffer) {
+      if (is_sub_buffer()) {
+        cuda_piMemRelease(mem_.buffer_mem_.parent_);
+        return;
+      }
+    }
+    cuda_piContextRelease(context_);
+  }

-  size_t get_size() const noexcept { return size_; }
+  // TODO: Move as many shared funcs up as possible
+  bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; }

-  void *get_map_ptr() const noexcept { return mapPtr_; }
+  bool is_sub_buffer() const noexcept {
+    return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr));
+  }

-  size_t get_map_offset(void *ptr) const noexcept { return mapOffset_; }
+  bool is_image() const noexcept { return mem_type_ == mem_type::surface; }

-  void *map_to_ptr(size_t offset, cl_map_flags flags) noexcept {
-    assert(mapPtr_ == nullptr);
-    mapOffset_ = offset;
-    mapFlags_ = flags;
-    if (hostPtr_ && (allocMode_ != alloc_mode::copy_in)) {
-      mapPtr_ = static_cast<char *>(hostPtr_) + offset;
-    } else {
-      // TODO: Allocate only what is needed based on the offset
-      mapPtr_ = static_cast<char *>(malloc(this->get_size()));
-    }
-    return mapPtr_;
-  }
+  pi_context get_context() const noexcept { return context_; }

-  void unmap(void *ptr) noexcept {
-    assert(mapPtr_ != nullptr);
+  pi_uint32 increment_reference_count() noexcept { return ++refCount_; }

-    if (mapPtr_ != hostPtr_) {
-      free(mapPtr_);
-    }
-    mapPtr_ = nullptr;
-    mapOffset_ = 0;
-  }
+  pi_uint32 decrement_reference_count() noexcept { return --refCount_; }

-  cl_map_flags get_map_flags() const noexcept {
-    assert(mapPtr_ != nullptr);
-    return mapFlags_;
-  }
+  pi_uint32 get_reference_count() const noexcept { return refCount_; }
 };

 /// PI queue mapping on to CUstream objects.
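Note for reviewers (not part of the patch): the pi_cuda.hpp change turns _pi_mem into a tagged union. mem_type_ is the discriminant, buffer_mem_ and surface_mem_ are the overlaid payloads, and every consumer (cuda_piMemRelease, cuda_piextKernelSetArgMemObj, the image enqueue entry points) dispatches on the tag before touching a payload. The standalone sketch below illustrates the same pattern with simplified, hypothetical names (plain integers stand in for CUdeviceptr/CUsurfObject; this is not the plugin's actual code):

// Standalone illustration of the tagged-union layout used by _pi_mem;
// simplified, hypothetical names -- not the plugin's definitions.
#include <cstddef>
#include <cstdio>

struct mem_object {
  enum class mem_type { buffer, surface } tag;

  union payload {
    struct buffer_part {
      unsigned long long ptr; // stand-in for CUdeviceptr
      std::size_t size;
    } buffer;
    struct surface_part {
      unsigned long long surf; // stand-in for CUsurfObject
      int dims;                // 1D, 2D, or 3D image
    } surface;
  } data;

  // A union has no active member until one is written, so each factory
  // sets the tag first and then fills its payload member by member,
  // mirroring the buffer and surface constructors in the patch.
  static mem_object make_buffer(unsigned long long p, std::size_t s) {
    mem_object m{mem_type::buffer};
    m.data.buffer.ptr = p;
    m.data.buffer.size = s;
    return m;
  }
  static mem_object make_surface(unsigned long long h, int dims) {
    mem_object m{mem_type::surface};
    m.data.surface.surf = h;
    m.data.surface.dims = dims;
    return m;
  }

private:
  explicit mem_object(mem_type t) : tag{t} {}
};

// Consumers must check the tag before reading a payload, as
// cuda_piextKernelSetArgMemObj does; reading the inactive union member
// is undefined behaviour.
static void set_kernel_arg(const mem_object &m) {
  if (m.tag == mem_object::mem_type::surface)
    std::printf("bind surface %llu (%dD image)\n", m.data.surface.surf,
                m.data.surface.dims);
  else
    std::printf("bind buffer %llu (%zu bytes)\n", m.data.buffer.ptr,
                m.data.buffer.size);
}

int main() {
  set_kernel_arg(mem_object::make_buffer(0x1000, 256));
  set_kernel_arg(mem_object::make_surface(7, 2));
  return 0;
}

Because the payloads share storage, the union keeps _pi_mem no larger than its biggest variant; the cost is that the tag must be consulted at every use site, which is why the patch threads mem_type_ checks through release, kernel-argument setting, and the image enqueue paths.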