diff --git a/.github/workflows/generate-coverage.yaml b/.github/workflows/generate-coverage.yaml index 5975837d55..7e0c8b8667 100644 --- a/.github/workflows/generate-coverage.yaml +++ b/.github/workflows/generate-coverage.yaml @@ -79,7 +79,7 @@ jobs: - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml] + pip install numpy"<1.26.0" cython setuptools pytest pytest-cov scikit-build cmake coverage[toml] - name: Build dpctl with coverage shell: bash -l {0} diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml index d12747f3a9..e7a27f633c 100644 --- a/.github/workflows/os-llvm-sycl-build.yml +++ b/.github/workflows/os-llvm-sycl-build.yml @@ -108,7 +108,7 @@ jobs: - name: Install dpctl dependencies shell: bash -l {0} run: | - pip install numpy cython setuptools pytest scikit-build cmake ninja + pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja - name: Checkout repo uses: actions/checkout@v3 diff --git a/CHANGELOG.md b/CHANGELOG.md index f85446fc9a..fc5bb4db36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,15 +4,43 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [dev] +## [0.15.0] ### Added + +* Added `dpctl.tensor.floor`, `dpctl.tensor.ceil`, `dpctl.tensor.trunc` elementwise functions. +* Added `dpctl.tensor.hypot`, `dpctl.tensor.logaddexp` elementwise functions. +* Added trigonometric (`dpctl.tensor.sin`, `dpctl.tensor.cos`, `dpctl.tensor.tan`) and hyperbolic (`dpctl.tensor.sinh`, `dpctl.tensor.cosh`, `dpctl.tensor.tanh`) elementwise functions and their inverses (`dpctl.tensor.asin`, `dpctl.tensor.asinh`, `dpctl.tensor.acos`, `dpctl.tensor.acosh`, `dpctl.tensor.atan`, `dpctl.tensor.atanh`). +* Added `dpctl.tensor.round` function. +* Added `dpctl.tensor.sign` and `dpctl.tensor.remainder` elementwise functions. +* Added bitwise elementwise functions `dpctl.tensor.bitwise_and`, `dpctl.tensor.bitwise_xor`, `dpctl.tensor.bitwise_or`, `dpctl.tensor.bitwise_invert` +* Added bitwise shift functions `dpctl.tensor.bitwise_left_shift` and `dpctl.tensor.bitwise_right_shift`. +* Added `dpctl.tensor.atan2` and `dpctl.tensor.signbit` elementwise functions. +* Added `dpctl.tensor.minumum` and `dpctl.tensor.maximum` binary elementwise functions. +* Supported equality checking and hashing for `dpctl.SyclPlatform`. +* Implemented `types` property for all unary and binary elementwise functions [#1361](https://github.com/IntelPython/dpctl/pull/1361) +* Added `dpctl.tensor.repeat` and `dpctl.tensor.tile` functions. +* Added `dpctl.tensor.matrix_transpose ` function. + ### Changed -* Removed `dpctl.tensor.numpy_usm_shared` obsolete class and associated tests which were being skipped +* Enabled support for Python arithmetic, in-place arithmetic, reflexive arithmetic, comparison, and bitwise operators for `dpctl.tensor.usm_ndarray` type [#1324](https://github.com/IntelPython/dpctl/pull/1324). +* Removed `dpctl.tensor.numpy_usm_shared` obsolete class and associated tests which were being skipped [#1310](https://github.com/IntelPython/dpctl/pull/1310) +* Transitioned `dpctl` codebase to Cython 3. +* Improved performance of boolean reduction functions `dpctl.tensor.all` and `dpctl.tensor.any`. 
+* Improved performance of summation function `dpctl.tensor.sum`. +* Improved in-place arithmetic operations for addition, subtraction and multiplication. +* Updated codebase per SYCL-2020 intel/llvm compiler deprecation warnings. +* Improved performance of advanced boolean indexing for arrays whose size fits in 32-bit signed integer type. +* Removed deprecated `DPCTLDevice_GetMaxWorkItemSizes` function from the SyclInterface library. +* Improved performance of `dpctl.tensor.reshape` in the case when a copy is being made. +* Improved performance of `dpctl.tensor.roll` function. ### Fixed +* Fixed issues identified by Coverity security scans. +* Fixed issues [#1279](https://github.com/IntelPython/dpctl/issues/1279), [#1350](https://github.com/IntelPython/dpctl/issues/1350), [#1344](https://github.com/IntelPython/dpctl/issues/1344), [#1327](https://github.com/IntelPython/dpctl/issues/1327), [#1241](https://github.com/IntelPython/dpctl/issues/1241), [#1250](https://github.com/IntelPython/dpctl/issues/1250), [#1293](https://github.com/IntelPython/dpctl/issues/1293). + ## [0.14.5] - 07/17/2023 ### Added diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 49f25aef6a..456eebdbaa 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -33,6 +33,7 @@ endif() set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -49,6 +50,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ) set(_clang_prefix "") if (WIN32) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index ad51689f3f..f0930004ec 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -72,11 +72,13 @@ iinfo, moveaxis, permute_dims, + repeat, result_type, roll, squeeze, stack, swapaxes, + tile, unstack, ) from dpctl.tensor._print import ( @@ -305,4 +307,6 @@ "tanh", "trunc", "allclose", + "repeat", + "tile", ] diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index cb54556ed2..7201cd96fb 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -15,10 +15,11 @@ # limitations under the License. 
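The changelog above lists the new elementwise functions and the Python operator support added to `dpctl.tensor.usm_ndarray` in 0.15.0. A minimal, hedged usage sketch of a few of them follows; it assumes a default SYCL device is available and uses only functions named in the changelog plus the long-standing `asarray`/`asnumpy` helpers.

```python
# Hedged sketch of a few 0.15.0 additions named in the changelog:
# hypot, logaddexp, the bitwise functions, and in-place arithmetic on
# usm_ndarray. Numeric output depends on the selected SYCL device.
import dpctl.tensor as dpt

a = dpt.asarray([3.0, 5.0, 8.0], dtype="f4")
b = dpt.asarray([4.0, 12.0, 15.0], dtype="f4")

c = dpt.hypot(a, b)        # element-wise sqrt(a*a + b*b) -> [5, 13, 17]
d = dpt.logaddexp(a, b)    # log(exp(a) + exp(b)) computed stably

m = dpt.asarray([0b1100, 0b1010], dtype="i4")
n = dpt.asarray([0b1010, 0b0110], dtype="i4")
k = dpt.bitwise_and(m, n)  # also reachable through the & operator

a += b                     # in-place arithmetic operator on usm_ndarray
print(dpt.asnumpy(c), dpt.asnumpy(d), dpt.asnumpy(k), dpt.asnumpy(a))
```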
+import itertools import operator -from itertools import chain, repeat import numpy as np +from numpy import AxisError from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple import dpctl @@ -133,7 +134,9 @@ def _broadcast_shape_impl(shapes): diff = biggest - nds[i] if diff > 0: ty = type(shapes[i]) - shapes[i] = ty(chain(repeat(1, diff), shapes[i])) + shapes[i] = ty( + itertools.chain(itertools.repeat(1, diff), shapes[i]) + ) common_shape = [] for axis in range(biggest): lengths = [s[axis] for s in shapes] @@ -918,6 +921,302 @@ def swapaxes(X, axis1, axis2): return dpt.permute_dims(X, tuple(ind)) +def repeat(x, repeats, axis=None): + """repeat(x, repeats, axis=None) + + Repeat elements of an array. + + Args: + x (usm_ndarray): input array + + repeat (Union[int, Tuple[int, ...]]): + The number of repetitions for each element. + `repeats` is broadcasted to fit the shape of the given axis. + + axis (Optional[int]): + The axis along which to repeat values. The `axis` is required + if input array has more than one dimension. + + Returns: + usm_narray: + Array with repeated elements. + The returned array must have the same data type as `x`, + is created on the same device as `x` and has the same USM + allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + f"`axis` cannot be `None` for array of dimension {x_ndim}" + ) + axis = 0 + + x_shape = x.shape + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + axis_size = x_shape[axis] + else: + if axis != 0: + AxisError("`axis` must be `0` for input of dimension `0`") + axis_size = x.size + + scalar = False + if isinstance(repeats, int): + if repeats < 0: + raise ValueError("`repeats` must be a positive integer") + usm_type = x.usm_type + exec_q = x.sycl_queue + scalar = True + elif isinstance(repeats, dpt.usm_ndarray): + if repeats.ndim > 1: + raise ValueError( + "`repeats` array must be 0- or 1-dimensional, got" + "{repeats.ndim}" + ) + exec_q = dpctl.utils.get_execution_queue( + (x.sycl_queue, repeats.sycl_queue) + ) + if exec_q is None: + raise dputils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + usm_type = dpctl.utils.get_coerced_usm_type( + ( + x.usm_type, + repeats.usm_type, + ) + ) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): + raise TypeError( + f"`repeats` data type `{repeats.dtype}` cannot be cast to " + "`int64` according to the casting rule ''safe.''" + ) + if repeats.size == 1: + scalar = True + # bring the single element to the host + repeats = int(repeats) + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + else: + if repeats.size != axis_size: + raise ValueError( + "`repeats` array must be broadcastable to the size of " + "the repeated axis" + ) + if not dpt.all(repeats >= 0): + raise ValueError("`repeats` elements must be positive") + + elif isinstance(repeats, tuple): + usm_type = x.usm_type + exec_q = x.sycl_queue + + len_reps = len(repeats) + if len_reps != axis_size: + raise ValueError( + "`repeats` tuple must have the same length as the repeated " + "axis" + ) + elif len_reps == 1: + repeats = repeats[0] + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + scalar = True + else: + repeats = dpt.asarray( + repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q + ) + if not dpt.all(repeats >= 0): + raise ValueError("`repeats` elements must be positive") + else: + raise TypeError( + "Expected int, tuple, or `usm_ndarray` for second argument," + f"got {type(repeats)}" + ) + + if axis_size == 0: + return dpt.empty(x_shape, dtype=x.dtype, sycl_queue=exec_q) + + if scalar: + res_axis_size = repeats * axis_size + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) + if res_axis_size > 0: + ht_rep_ev, _ = ti._repeat_by_scalar( + src=x, + dst=res, + reps=repeats, + axis=axis, + sycl_queue=exec_q, + ) + ht_rep_ev.wait() + else: + if repeats.dtype != dpt.int64: + rep_buf = dpt.empty( + repeats.shape, + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=repeats, dst=rep_buf, sycl_queue=exec_q + ) + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + # _cumsum_1d synchronizes so `depends` ends here safely + res_axis_size = ti._cumsum_1d( + rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] + ) + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) + if res_axis_size > 0: + ht_rep_ev, _ = ti._repeat_by_sequence( + src=x, + dst=res, + reps=rep_buf, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + ht_rep_ev.wait() + ht_copy_ev.wait() + else: + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + # _cumsum_1d synchronizes so `depends` ends here safely + res_axis_size = ti._cumsum_1d(repeats, cumsum, sycl_queue=exec_q) + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) + if res_axis_size > 0: + ht_rep_ev, _ = ti._repeat_by_sequence( + src=x, + dst=res, + reps=repeats, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + ht_rep_ev.wait() + return res + + +def tile(x, repetitions): + """tile(x, repetitions) + + Repeat an input array `x` along each axis a number of times given by + `repetitions`. 
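The `repeat` implementation above accepts an integer, a tuple, or a `usm_ndarray` of per-element counts. A short, hedged usage sketch follows; shapes and values follow directly from the code above, and a default SYCL device is assumed.

```python
# Hedged usage sketch for dpt.repeat as implemented above. `axis` is
# mandatory once the input has more than one dimension.
import dpctl.tensor as dpt

x = dpt.asarray([[1, 2], [3, 4]], dtype="i8")

r0 = dpt.repeat(x, 2, axis=0)             # shape (4, 2): every row twice
r1 = dpt.repeat(x, (1, 3), axis=1)        # shape (2, 4): 2nd column three times

counts = dpt.asarray([2, 0], dtype="i8")  # per-element counts along axis 0
r2 = dpt.repeat(x, counts, axis=0)        # shape (2, 2): second row dropped

print(dpt.asnumpy(r0))
print(dpt.asnumpy(r1))
print(dpt.asnumpy(r2))
```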
+ + For `N` = len(`repetitions`) and `M` = len(`x.shape`): + - if `M < N`, `x` will have `N - M` new axes prepended to its shape + - if `M > N`, `repetitions` will have `M - N` new axes 1 prepended to it + + Args: + x (usm_ndarray): input array + + repetitions (Union[int, Tuple[int, ...]]): + The number of repetitions for each dimension. + + Returns: + usm_narray: + Array with tiled elements. + The returned array must have the same data type as `x`, + is created on the same device as `x` and has the same USM + allocation type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + if not isinstance(repetitions, tuple): + if isinstance(repetitions, int): + repetitions = (repetitions,) + else: + raise TypeError( + f"Expected tuple or integer type, got {type(repetitions)}." + ) + + # case of scalar + if x.size == 1: + if not repetitions: + # handle empty tuple + repetitions = (1,) + return dpt.full( + repetitions, + x, + dtype=x.dtype, + usm_type=x.usm_type, + sycl_queue=x.sycl_queue, + ) + rep_dims = len(repetitions) + x_dims = x.ndim + if rep_dims < x_dims: + repetitions = (x_dims - rep_dims) * (1,) + repetitions + elif x_dims < rep_dims: + x = dpt.reshape(x, (rep_dims - x_dims) * (1,) + x.shape) + res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions)) + # case of empty input + if x.size == 0: + return dpt.empty( + res_shape, x.dtype, usm_type=x.usm_type, sycl_queue=x.sycl_queue + ) + in_sh = x.shape + if res_shape == in_sh: + return dpt.copy(x) + expanded_sh = [] + broadcast_sh = [] + out_sz = 1 + for i in range(len(res_shape)): + out_sz *= res_shape[i] + reps, sh = repetitions[i], in_sh[i] + if reps == 1: + # dimension will be unchanged + broadcast_sh.append(sh) + expanded_sh.append(sh) + elif sh == 1: + # dimension will be broadcast + broadcast_sh.append(reps) + expanded_sh.append(sh) + else: + broadcast_sh.extend([reps, sh]) + expanded_sh.extend([1, sh]) + exec_q = x.sycl_queue + res = dpt.empty((out_sz,), x.dtype, usm_type=x.usm_type, sycl_queue=exec_q) + # no need to copy data for empty output + if out_sz > 0: + x = dpt.broadcast_to( + # this reshape should never copy + dpt.reshape(x, expanded_sh), + broadcast_sh, + ) + # copy broadcast input into flat array + hev, _ = ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q + ) + hev.wait() + return dpt.reshape(res, res_shape) + + def _supported_dtype(dtypes): for dtype in dtypes: if dtype.char not in "?bBhHiIlLqQefdFD": diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp new file mode 100644 index 0000000000..0072690e9b --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -0,0 +1,387 @@ +//=== accumulators.hpp - Implementation of accumulator kernels --*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
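The `tile` implementation above mirrors the familiar NumPy semantics: `repetitions` and `x.shape` are aligned by prepending ones to the shorter of the two. A brief, hedged usage sketch (a default SYCL device is assumed):

```python
# Hedged usage sketch for dpt.tile as implemented above.
import dpctl.tensor as dpt

x = dpt.asarray([1, 2, 3], dtype="i8")

t0 = dpt.tile(x, 2)        # shape (6,): [1, 2, 3, 1, 2, 3]
t1 = dpt.tile(x, (2, 2))   # shape (2, 6): x is treated as shape (1, 3)

y = dpt.asarray([[1, 2], [3, 4]], dtype="i8")
t2 = dpt.tile(y, 2)        # shape (2, 4): repetitions is treated as (1, 2)

print(dpt.asnumpy(t0))
print(dpt.asnumpy(t1))
print(dpt.asnumpy(t2))
```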
+// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for accumulators (cumulative sum, prod, etc.). +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace accumulators +{ + +namespace py = pybind11; + +using namespace dpctl::tensor::offset_utils; + +template T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} +template T1 ceiling_quotient(T1 n, T2 m) +{ + return ceiling_quotient(n, static_cast(m)); +} + +template +class inclusive_scan_rec_local_scan_krn; + +template +class inclusive_scan_rec_chunk_update_krn; + +template struct NonZeroIndicator +{ + NonZeroIndicator() {} + + outputT operator()(const inputT &val) const + { + constexpr outputT out_one(1); + constexpr outputT out_zero(0); + constexpr inputT val_zero(0); + + return (val == val_zero) ? out_zero : out_one; + } +}; + +template struct NoOpTransformer +{ + NoOpTransformer() {} + + T operator()(const T &val) const + { + return val; + } +}; + +/* + * for integer type maskT, + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_rec(sycl::queue exec_q, + size_t n_elems, + size_t wg_size, + const inputT *input, + outputT *output, + size_t s0, + size_t s1, + IndexerT indexer, + TransformerT transformer, + std::vector const &depends = {}) +{ + size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto lws = sycl::range<1>(wg_size); + auto gws = sycl::range<1>(n_groups * wg_size); + + slmT slm_iscan_tmp(lws, cgh); + + cgh.parallel_for>( + sycl::nd_range<1>(gws, lws), [=](sycl::nd_item<1> it) + { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); + + std::array local_isum; + + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); + + local_isum[m_wi] = + (i + m_wi < n_elems) + ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) + : out_zero; + } + +// local_isum is now result of +// inclusive scan of locally stored mask indicators +#pragma unroll + for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += local_isum[m_wi - 1]; + } + + size_t wg_iscan_val = + sycl::inclusive_scan_over_group(it.get_group(), + local_isum.back(), + sycl::plus(), + size_t(0)); + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + size_t addand = (lid == 0) ? 
0 : slm_iscan_tmp[lid]; + it.barrier(sycl::access::fence_space::local_space); + +#pragma unroll + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + local_isum[m_wi] += addand; + } + + for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { + output[i + m_wi] = local_isum[m_wi]; + } + }); + }); + + sycl::event out_event = inc_scan_phase1_ev; + if (n_groups > 1) { + outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); + + auto chunk_size = wg_size * n_wi; + + NoOpIndexer _no_op_indexer{}; + NoOpTransformer _no_op_transformer{}; + auto e2 = inclusive_scan_rec( + exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, + chunk_size, _no_op_indexer, _no_op_transformer, + {inc_scan_phase1_ev}); + + // output[ chunk_size * (i + 1) + j] += temp[i] + auto e3 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e2); + cgh.parallel_for>( + {n_elems}, [=](auto wiid) + { + auto gid = wiid[0]; + auto i = (gid / chunk_size); + output[gid] += (i > 0) ? temp[i - 1] : 0; + }); + }); + + sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(e3); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); + }); + + out_event = e4; + } + + return out_event; +} + +typedef size_t (*accumulate_contig_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + std::vector const &); + +template +size_t accumulate_contig_impl(sycl::queue q, + size_t n_elems, + const char *mask, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + NoOpIndexer flat_indexer{}; + transformerT non_zero_indicator{}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, + flat_indexer, non_zero_indicator, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsContigFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + accumulate_contig_impl>; + return fn; + } +}; + +template struct MaskPositionsContigFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + accumulate_contig_impl>; + return fn; + } +}; + +template struct Cumsum1DContigFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + accumulate_contig_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +typedef size_t (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + int, + const py::ssize_t *, + char *, + std::vector const &); + +template +size_t accumulate_strided_impl(sycl::queue q, + size_t n_elems, + const char *mask, + int nd, + const py::ssize_t *shape_strides, + char *cumsum, + std::vector const &depends = {}) +{ + constexpr int n_wi = 8; + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + size_t wg_size = 128; + + StridedIndexer strided_indexer{nd, 0, shape_strides}; + transformerT non_zero_indicator{}; + + sycl::event comp_ev = + inclusive_scan_rec( + q, n_elems, wg_size, mask_data_ptr, 
cumsum_data_ptr, 0, 1, + strided_indexer, non_zero_indicator, depends); + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); + + if (last_elem_host_usm == nullptr) { + throw std::bad_alloc(); + } + sycl::event copy_e = + q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); + copy_e.wait(); + size_t return_val = static_cast(*last_elem_host_usm); + sycl::free(last_elem_host_usm, q); + + return return_val; +} + +template struct MaskPositionsStridedFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + accumulate_strided_impl>; + return fn; + } +}; + +template struct MaskPositionsStridedFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + accumulate_strided_impl>; + return fn; + } +}; + +template struct Cumsum1DStridedFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + accumulate_strided_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +} // namespace accumulators +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 43c546860b..595fc68496 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -24,7 +24,6 @@ #pragma once #include -#include #include #include #include @@ -47,172 +46,6 @@ namespace py = pybind11; using namespace dpctl::tensor::offset_utils; -template T ceiling_quotient(T n, T m) -{ - return (n + m - 1) / m; -} -template T1 ceiling_quotient(T1 n, T2 m) -{ - return ceiling_quotient(n, static_cast(m)); -} - -template -class inclusive_scan_rec_local_scan_krn; - -template -class inclusive_scan_rec_chunk_update_krn; - -template struct NonZeroIndicator -{ - NonZeroIndicator() {} - - outputT operator()(const inputT &val) const - { - constexpr outputT out_one(1); - constexpr outputT out_zero(0); - constexpr inputT val_zero(0); - - return (val == val_zero) ? out_zero : out_one; - } -}; - -template struct NoOpTransformer -{ - NoOpTransformer() {} - - T operator()(const T &val) const - { - return val; - } -}; - -/* - * for integer type maskT, - * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) - * for 0 <= j < n_elems - */ -template -sycl::event inclusive_scan_rec(sycl::queue exec_q, - size_t n_elems, - size_t wg_size, - const inputT *input, - outputT *output, - size_t s0, - size_t s1, - IndexerT indexer, - TransformerT transformer, - std::vector const &depends = {}) -{ - size_t n_groups = ceiling_quotient(n_elems, n_wi * wg_size); - - sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using slmT = sycl::local_accessor; - - auto lws = sycl::range<1>(wg_size); - auto gws = sycl::range<1>(n_groups * wg_size); - - slmT slm_iscan_tmp(lws, cgh); - - cgh.parallel_for>( - sycl::nd_range<1>(gws, lws), [=](sycl::nd_item<1> it) - { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); - - std::array local_isum; - - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); - - local_isum[m_wi] = - (i + m_wi < n_elems) - ? 
transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } - -// local_isum is now result of -// inclusive scan of locally stored mask indicators -#pragma unroll - for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += local_isum[m_wi - 1]; - } - - size_t wg_iscan_val = - sycl::inclusive_scan_over_group(it.get_group(), - local_isum.back(), - sycl::plus(), - size_t(0)); - - slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; - it.barrier(sycl::access::fence_space::local_space); - size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid]; - it.barrier(sycl::access::fence_space::local_space); - -#pragma unroll - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - local_isum[m_wi] += addand; - } - - for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) { - output[i + m_wi] = local_isum[m_wi]; - } - }); - }); - - sycl::event out_event = inc_scan_phase1_ev; - if (n_groups > 1) { - outputT *temp = sycl::malloc_device(n_groups - 1, exec_q); - - auto chunk_size = wg_size * n_wi; - - NoOpIndexer _no_op_indexer{}; - NoOpTransformer _no_op_transformer{}; - auto e2 = inclusive_scan_rec( - exec_q, n_groups - 1, wg_size, output, temp, chunk_size - 1, - chunk_size, _no_op_indexer, _no_op_transformer, - {inc_scan_phase1_ev}); - - // output[ chunk_size * (i + 1) + j] += temp[i] - auto e3 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e2); - cgh.parallel_for>( - {n_elems}, [=](auto wiid) - { - auto gid = wiid[0]; - auto i = (gid / chunk_size); - output[gid] += (i > 0) ? temp[i - 1] : 0; - }); - }); - - sycl::event e4 = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(e3); - auto ctx = exec_q.get_context(); - cgh.host_task([ctx, temp]() { sycl::free(temp, ctx); }); - }); - - out_event = e4; - } - - return out_event; -} - template const &); - -template -size_t mask_positions_contig_impl(sycl::queue q, - size_t n_elems, - const char *mask, - char *cumsum, - std::vector const &depends = {}) -{ - constexpr int n_wi = 8; - const maskT *mask_data_ptr = reinterpret_cast(mask); - cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; - - NoOpIndexer flat_indexer{}; - NonZeroIndicator non_zero_indicator{}; - - sycl::event comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - flat_indexer, non_zero_indicator, depends); - - cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); - - cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); - - if (last_elem_host_usm == nullptr) { - throw std::bad_alloc(); - } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); - copy_e.wait(); - size_t return_val = static_cast(*last_elem_host_usm); - sycl::free(last_elem_host_usm, q); - - return return_val; -} - -template struct MaskPositionsContigFactoryForInt32 -{ - fnT get() - { - fnT fn = mask_positions_contig_impl; - return fn; - } -}; - -template struct MaskPositionsContigFactoryForInt64 -{ - fnT get() - { - fnT fn = mask_positions_contig_impl; - return fn; - } -}; - -typedef size_t (*mask_positions_strided_impl_fn_ptr_t)( - sycl::queue, - size_t, - const char *, - int, - const py::ssize_t *, - char *, - std::vector const &); - -template -size_t mask_positions_strided_impl(sycl::queue q, - size_t n_elems, - const char *mask, - int nd, - const py::ssize_t *shape_strides, - char *cumsum, - std::vector const &depends = {}) -{ - constexpr int n_wi = 8; - const maskT *mask_data_ptr = reinterpret_cast(mask); - cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); - size_t wg_size = 128; - - StridedIndexer 
strided_indexer{nd, 0, shape_strides}; - NonZeroIndicator non_zero_indicator{}; - - sycl::event comp_ev = - inclusive_scan_rec( - q, n_elems, wg_size, mask_data_ptr, cumsum_data_ptr, 0, 1, - strided_indexer, non_zero_indicator, depends); - - cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); - - cumsumT *last_elem_host_usm = sycl::malloc_host(1, q); - - if (last_elem_host_usm == nullptr) { - throw std::bad_alloc(); - } - sycl::event copy_e = - q.copy(last_elem, last_elem_host_usm, 1, {comp_ev}); - copy_e.wait(); - size_t return_val = static_cast(*last_elem_host_usm); - sycl::free(last_elem_host_usm, q); - - return return_val; -} - -template struct MaskPositionsStridedFactoryForInt32 -{ - fnT get() - { - fnT fn = mask_positions_strided_impl; - return fn; - } -}; - -template struct MaskPositionsStridedFactoryForInt64 -{ - fnT get() - { - fnT fn = mask_positions_strided_impl; - return fn; - } -}; - // ======= Masked extraction ================================ template it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t reduction_id = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t reduction_id = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t wg_size = it.get_local_range(0); const size_t base = reduction_id * reduction_max_gid_; @@ -241,14 +241,6 @@ struct ContigBooleanReduction // in group_op_ group_op_(it, out_, reduction_id, inp_ + start, inp_ + end); } - -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const - { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; - } }; typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)( @@ -268,17 +260,19 @@ class boolean_reduction_contig_krn; template class boolean_reduction_seq_contig_krn; +using dpctl::tensor::sycl_utils::choose_workgroup_size; + template sycl::event -boolean_reduction_contig_impl(sycl::queue exec_q, - size_t iter_nelems, - size_t reduction_nelems, - const char *arg_cp, - char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t red_arg_offset, - const std::vector &depends) +boolean_reduction_axis1_contig_impl(sycl::queue exec_q, + size_t iter_nelems, + size_t reduction_nelems, + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t red_arg_offset, + const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + iter_arg_offset + red_arg_offset; @@ -288,8 +282,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = - 4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); sycl::event red_ev; if (reduction_nelems < wg) { @@ -322,18 +315,8 @@ boolean_reduction_contig_impl(sycl::queue exec_q, }); } else { - sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { - using IndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - - IndexerT res_indexer{}; - - cgh.depends_on(depends); - - cgh.parallel_for(sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { - auto res_offset = res_indexer(id[0]); - res_tp[res_offset] = identity_val; - }); - }); + sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), + iter_nelems, depends); red_ev = 
exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(init_ev); @@ -363,7 +346,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q, return red_ev; } -template struct AllContigFactory +template struct AllAxis1ContigFactory { fnT get() const { @@ -372,12 +355,12 @@ template struct AllContigFactory using GroupOpT = all_reduce_wg_contig>; - return dpctl::tensor::kernels::boolean_reduction_contig_impl< + return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl< srcTy, resTy, RedOpT, GroupOpT>; } }; -template struct AnyContigFactory +template struct AnyAxis1ContigFactory { fnT get() const { @@ -386,7 +369,7 @@ template struct AnyContigFactory using GroupOpT = any_reduce_wg_contig>; - return dpctl::tensor::kernels::boolean_reduction_contig_impl< + return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl< srcTy, resTy, RedOpT, GroupOpT>; } }; @@ -433,9 +416,9 @@ struct StridedBooleanReduction void operator()(sycl::nd_item<1> it) const { - const size_t red_gws_ = it.get_global_range(0) / iter_gws_; - const size_t reduction_id = it.get_global_id(0) / red_gws_; - const size_t reduction_batch_id = get_reduction_batch_id(it); + const size_t reduction_id = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t reduction_lid = it.get_local_id(0); const size_t wg_size = it.get_local_range(0); @@ -468,13 +451,112 @@ struct StridedBooleanReduction // in group_op_ group_op_(it, out_, out_iter_offset, local_red_val); } +}; + +template +class boolean_reduction_axis0_contig_krn; + +template +sycl::event +boolean_reduction_axis0_contig_impl(sycl::queue exec_q, + size_t iter_nelems, + size_t reduction_nelems, + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t red_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + red_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + constexpr resTy identity_val = sycl::known_identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); -private: - size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const { - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups; - return reduction_batch_id; + sycl::event init_ev = exec_q.fill(res_tp, resTy(identity_val), + iter_nelems, depends); + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(init_ev); + + constexpr std::uint8_t dim = 1; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + constexpr size_t preferred_reductions_per_wi = 4; + size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
((reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto gws = sycl::range{iter_nelems * reduction_groups * wg}; + auto lws = sycl::range{wg}; + + cgh.parallel_for>( + sycl::nd_range(gws, lws), + StridedBooleanReduction( + arg_tp, res_tp, RedOpT(), GroupOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + return red_ev; + } +} + +template struct AllAxis0ContigFactory +{ + fnT get() const + { + using resTy = std::int32_t; + using RedOpT = sycl::logical_and; + using GroupOpT = all_reduce_wg_strided; + + return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl< + srcTy, resTy, RedOpT, GroupOpT>; + } +}; + +template struct AnyAxis0ContigFactory +{ + fnT get() const + { + using resTy = std::int32_t; + using RedOpT = sycl::logical_or; + using GroupOpT = any_reduce_wg_strided; + + return dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl< + srcTy, resTy, RedOpT, GroupOpT>; } }; @@ -527,8 +609,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); - size_t wg = - 4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes))); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); sycl::event red_ev; if (reduction_nelems < wg) { @@ -558,7 +639,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, }); } else { - sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) { using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; @@ -576,7 +657,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q, }); }); red_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); + cgh.depends_on(init_ev); constexpr std::uint8_t dim = 1; diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp new file mode 100644 index 0000000000..4129f358df --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -0,0 +1,438 @@ +//=== repeat.hpp - Implementation of repeat kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor repeating operations. 
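The changes above split the contiguous boolean reductions into axis-1 (rows contiguous) and axis-0 (fixed stride of `iter_nelems` down each column) variants, with the result buffer first filled with the reduction identity. Below is a hedged NumPy reference of what these kernels compute; the helper names are hypothetical, and the SYCL work-group decomposition (`choose_workgroup_size`, `reductions_per_wi`) is deliberately not modelled.

```python
# NumPy model of the semantics of the axis1/axis0 contiguous boolean
# reduction kernels: logical AND (all) / OR (any) over a 2D row-major
# block, results stored as int32.
import numpy as np

def boolean_reduce_axis1_contig(a2d, op=np.logical_and):
    # each row is an independent reduction over contiguous memory
    return op.reduce(a2d != 0, axis=1).astype(np.int32)

def boolean_reduce_axis0_contig(a2d, op=np.logical_or):
    # the result is identity-filled first, then each column is reduced with
    # a stride equal to the number of independent reductions (iter_nelems)
    return op.reduce(a2d != 0, axis=0).astype(np.int32)

a = np.array([[1, 1, 0],
              [1, 1, 1]])
print(boolean_reduce_axis1_contig(a))  # "all" over rows    -> [0 1]
print(boolean_reduce_axis0_contig(a))  # "any" over columns -> [1 1 1]
```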
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace repeat +{ + +namespace py = pybind11; +using namespace dpctl::tensor::offset_utils; + +template +class repeat_by_sequence_kernel; + +template +class RepeatSequenceFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const repT *reps = nullptr; + const repT *cumsum = nullptr; + size_t src_axis_nelems = 1; + OrthogIndexer orthog_strider; + AxisIndexer src_axis_strider; + AxisIndexer dst_axis_strider; + RepIndexer reps_strider; + +public: + RepeatSequenceFunctor(const T *src_, + T *dst_, + const repT *reps_, + const repT *cumsum_, + size_t src_axis_nelems_, + OrthogIndexer orthog_strider_, + AxisIndexer src_axis_strider_, + AxisIndexer dst_axis_strider_, + RepIndexer reps_strider_) + : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), + src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), + src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + size_t id = idx[0]; + auto i_orthog = id / src_axis_nelems; + auto i_along = id - (i_orthog * src_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto val = src[src_offset + src_axis_strider(i_along)]; + auto last = cumsum[i_along]; + auto first = last - reps[reps_strider(i_along)]; + for (auto i = first; i < last; ++i) { + dst[dst_offset + dst_axis_strider(i)] = val; + } + } +}; + +typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + const char *, + const char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event +repeat_by_sequence_impl(sycl::queue q, + size_t orthog_nelems, + size_t src_axis_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int orthog_nd, + const py::ssize_t *orthog_src_dst_shape_and_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset, + py::ssize_t src_axis_shape, + py::ssize_t src_axis_stride, + py::ssize_t dst_axis_shape, + py::ssize_t dst_axis_stride, + py::ssize_t reps_shape, + py::ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, + orthog_src_dst_shape_and_strides}; + // indexers along repeated axis + Strided1DIndexer src_axis_indexer{0, src_axis_shape, src_axis_stride}; + Strided1DIndexer dst_axis_indexer{0, dst_axis_shape, dst_axis_stride}; + // indexer along reps array + Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; + + const size_t gws = orthog_nelems * src_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, + 
orthog_indexer, src_axis_indexer, dst_axis_indexer, + reps_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatSequenceFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + const char *, + const char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event repeat_by_sequence_1d_impl(sycl::queue q, + size_t src_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + py::ssize_t src_shape, + py::ssize_t src_stride, + py::ssize_t dst_shape, + py::ssize_t dst_stride, + py::ssize_t reps_shape, + py::ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + Strided1DIndexer src_indexer{0, src_shape, src_stride}; + Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; + // indexer along reps array + Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; + + const size_t gws = src_nelems; + + cgh.parallel_for< + repeat_by_sequence_kernel>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, + src_indexer, dst_indexer, reps_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatSequence1DFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_1d_impl; + return fn; + } +}; + +template +class repeat_by_scalar_kernel; + +template +class RepeatScalarFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const py::ssize_t reps = 1; + size_t dst_axis_nelems = 0; + OrthogIndexer orthog_strider; + AxisIndexer src_axis_strider; + AxisIndexer dst_axis_strider; + +public: + RepeatScalarFunctor(const T *src_, + T *dst_, + const py::ssize_t reps_, + size_t dst_axis_nelems_, + OrthogIndexer orthog_strider_, + AxisIndexer src_axis_strider_, + AxisIndexer dst_axis_strider_) + : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), + orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + size_t id = idx[0]; + auto i_orthog = id / dst_axis_nelems; + auto i_along = id - (i_orthog * dst_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto dst_axis_offset = dst_axis_strider(i_along); + auto src_axis_offset = src_axis_strider(i_along / reps); + dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset]; + } +}; + +typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + const py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_impl(sycl::queue q, + size_t orthog_nelems, + size_t dst_axis_nelems, + const char *src_cp, + char *dst_cp, + const py::ssize_t reps, + int orthog_nd, + const py::ssize_t 
*orthog_shape_and_strides, + py::ssize_t src_offset, + py::ssize_t dst_offset, + py::ssize_t src_axis_shape, + py::ssize_t src_axis_stride, + py::ssize_t dst_axis_shape, + py::ssize_t dst_axis_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, orthog_shape_and_strides}; + // indexers along repeated axis + Strided1DIndexer src_axis_indexer{0, src_axis_shape, src_axis_stride}; + Strided1DIndexer dst_axis_indexer{0, dst_axis_shape, dst_axis_stride}; + + const size_t gws = orthog_nelems * dst_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, + src_axis_indexer, dst_axis_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatScalarFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( + sycl::queue, + size_t, + const char *, + char *, + const py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_1d_impl(sycl::queue q, + size_t dst_nelems, + const char *src_cp, + char *dst_cp, + const py::ssize_t reps, + py::ssize_t src_shape, + py::ssize_t src_stride, + py::ssize_t dst_shape, + py::ssize_t dst_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + Strided1DIndexer src_indexer(0, src_shape, src_stride); + Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; + + const size_t gws = dst_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_nelems, orthog_indexer, src_indexer, + dst_indexer)); + }); + + return repeat_ev; +} + +template struct RepeatScalar1DFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_1d_impl; + return fn; + } +}; + +} // namespace repeat +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp new file mode 100644 index 0000000000..5ce863ad3f --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators.cpp @@ -0,0 +1,352 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
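The `RepeatSequenceFunctor` and `RepeatScalarFunctor` kernels above map each work item to one source element (sequence case) or one destination element (scalar case) along the repeated axis. A hedged pure-Python model of that index arithmetic for the 1D, unit-stride case follows; the helper names are hypothetical.

```python
# Model of the index math in the repeat kernels: the inclusive cumulative
# sum of `reps` gives the end position of each element's run of copies.
import itertools

def repeat_by_sequence_1d(src, reps):
    cumsum = list(itertools.accumulate(reps))  # what _cumsum_1d provides
    dst = [None] * (cumsum[-1] if cumsum else 0)
    for i_along, val in enumerate(src):
        last = cumsum[i_along]
        first = last - reps[i_along]
        for j in range(first, last):           # write reps[i_along] copies
            dst[j] = val
    return dst

def repeat_by_scalar_1d(src, reps):
    # each destination index j simply reads src[j // reps]
    return [src[j // reps] for j in range(len(src) * reps)]

print(repeat_by_sequence_1d([10, 20, 30], [2, 0, 3]))  # [10, 10, 30, 30, 30]
print(repeat_by_scalar_1d([10, 20, 30], 2))            # [10, 10, 20, 20, 30, 30]
```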
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +// Computation of positions of masked elements + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; +static accumulate_contig_impl_fn_ptr_t + mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; +static accumulate_contig_impl_fn_ptr_t + mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; +static accumulate_strided_impl_fn_ptr_t + mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); + + return; +} + +size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int32_t/int64_t only + constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32); + constexpr int 
int64_typeid = static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int32 or int64 data-type."); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (mask.is_c_contiguous()) { + auto fn = (use_i32) + ? mask_positions_contig_i32_dispatch_vector[mask_typeid] + : mask_positions_contig_i64_dispatch_vector[mask_typeid]; + + return fn(exec_q, mask_size, mask_data, cumsum_data, depends); + } + + const py::ssize_t *shape = mask.get_shape_raw(); + auto const &strides_vector = mask.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = + (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid] + : mask_positions_strided_i64_dispatch_vector[mask_typeid]; + std::vector host_task_events; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + if (shape_strides == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Unexpected error"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, + shape_strides, cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total_set; +} + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_1d_strided_dispatch_vector[td_ns::num_types]; +using dpctl::tensor::kernels::accumulators::accumulate_contig_impl_fn_ptr_t; +static accumulate_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_vector[td_ns::num_types]; + +void populate_cumsum_1d_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); + + using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); + + return; +} + +size_t py_cumsum_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + // cumsum.shape == (src.size,) + auto src_size = src.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != src_size) { + throw py::value_error("Inconsistent 
dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_size == 0) { + return 0; + } + + int src_typenum = src.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // src can be any type + const char *src_data = src.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // this cumsum must be int64_t only + constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + if (src.is_c_contiguous()) { + auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + return fn(exec_q, src_size, src_data, cumsum_data, depends); + } + + const py::ssize_t *shape = src.get_shape_raw(); + auto const &strides_vector = src.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int src_nd = src.get_ndim(); + int nd = src_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + std::vector host_task_events; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); + if (shape_strides == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Unexpected error"); + } + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + size_t total = strided_fn(exec_q, src_size, src_data, nd, shape_strides, + cumsum_data, dependent_events); + + sycl::event::wait(host_task_events); + sycl::free(shape_strides, exec_q); + + return total; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/accumulators.hpp b/dpctl/tensor/libtensor/source/accumulators.hpp new file mode 100644 index 0000000000..f4e5ce9d84 --- /dev/null +++ b/dpctl/tensor/libtensor/source/accumulators.hpp @@ -0,0 +1,56 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +extern void populate_cumsum_1d_dispatch_vectors(void); + +extern size_t py_cumsum_1d(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray cumsum, + sycl::queue exec_q, + std::vector const &depends = {}); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp index d37967daef..43ab92b86d 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -46,214 +46,10 @@ namespace tensor namespace py_internal { -/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < - * axis_end) and dir2 (along given set of axes) - */ -template -void _split_iteration_space(const shT &shape_vec, - const shT &strides_vec, - int axis_start, - int axis_end, - shT &dir1_shape_vec, - shT &dir2_shape_vec, - shT &dir1_strides_vec, - shT &dir2_strides_vec) -{ - int nd = static_cast(shape_vec.size()); - int dir2_sz = axis_end - axis_start; - int dir1_sz = nd - dir2_sz; - - assert(dir1_sz > 0); - assert(dir2_sz > 0); - - dir1_shape_vec.resize(dir1_sz); - dir2_shape_vec.resize(dir2_sz); - - std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, - dir1_shape_vec.begin()); - std::copy(shape_vec.begin() + axis_end, shape_vec.end(), - dir1_shape_vec.begin() + axis_start); - - std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, - dir2_shape_vec.begin()); - - dir1_strides_vec.resize(dir1_sz); - dir2_strides_vec.resize(dir2_sz); - - std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, - dir1_strides_vec.begin()); - std::copy(strides_vec.begin() + axis_end, strides_vec.end(), - dir1_strides_vec.begin() + axis_start); - - std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, - dir2_strides_vec.begin()); - - return; -} - -// Computation of positions of masked elements +// Masked extraction namespace td_ns = dpctl::tensor::type_dispatch; -using dpctl::tensor::kernels::indexing::mask_positions_contig_impl_fn_ptr_t; -static mask_positions_contig_impl_fn_ptr_t - mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; -static mask_positions_contig_impl_fn_ptr_t - mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; - -using dpctl::tensor::kernels::indexing::mask_positions_strided_impl_fn_ptr_t; -static mask_positions_strided_impl_fn_ptr_t - 
mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; -static mask_positions_strided_impl_fn_ptr_t - mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; - -void populate_mask_positions_dispatch_vectors(void) -{ - using dpctl::tensor::kernels::indexing::MaskPositionsContigFactoryForInt64; - td_ns::DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); - - using dpctl::tensor::kernels::indexing::MaskPositionsContigFactoryForInt32; - td_ns::DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); - - using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactoryForInt64; - td_ns::DispatchVectorBuilder - dvb3; - dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); - - using dpctl::tensor::kernels::indexing::MaskPositionsStridedFactoryForInt32; - td_ns::DispatchVectorBuilder - dvb4; - dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); - - return; -} - -size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, - dpctl::tensor::usm_ndarray cumsum, - sycl::queue exec_q, - std::vector const &depends) -{ - // cumsum is 1D - if (cumsum.get_ndim() != 1) { - throw py::value_error("Result array must be one-dimensional."); - } - - if (!cumsum.is_c_contiguous()) { - throw py::value_error("Expecting `cumsum` array must be C-contiguous."); - } - - // cumsum.shape == (mask.size,) - auto mask_size = mask.get_size(); - auto cumsum_size = cumsum.get_shape(0); - if (cumsum_size != mask_size) { - throw py::value_error("Inconsistent dimensions"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { - // FIXME: use ExecutionPlacementError - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - if (mask_size == 0) { - return 0; - } - - int mask_typenum = mask.get_typenum(); - int cumsum_typenum = cumsum.get_typenum(); - - // mask can be any type - const char *mask_data = mask.get_data(); - char *cumsum_data = cumsum.get_data(); - - auto const &array_types = td_ns::usm_ndarray_types(); - - int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); - int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); - - // cumsum must be int32_t/int64_t only - constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32); - constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); - if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { - throw py::value_error( - "Cumulative sum array must have int32 or int64 data-type."); - } - - const bool use_i32 = (cumsum_typeid == int32_typeid); - - if (mask.is_c_contiguous()) { - auto fn = (use_i32) - ? mask_positions_contig_i32_dispatch_vector[mask_typeid] - : mask_positions_contig_i64_dispatch_vector[mask_typeid]; - - return fn(exec_q, mask_size, mask_data, cumsum_data, depends); - } - - const py::ssize_t *shape = mask.get_shape_raw(); - auto const &strides_vector = mask.get_strides_vector(); - - using shT = std::vector; - shT compact_shape; - shT compact_strides; - - int mask_nd = mask.get_ndim(); - int nd = mask_nd; - - dpctl::tensor::py_internal::compact_iteration_space( - nd, shape, strides_vector, compact_shape, compact_strides); - - // Strided implementation - auto strided_fn = - (use_i32) ? 
mask_positions_strided_i32_dispatch_vector[mask_typeid] - : mask_positions_strided_i64_dispatch_vector[mask_typeid]; - std::vector host_task_events; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - const auto &ptr_size_event_tuple = device_allocate_and_pack( - exec_q, host_task_events, compact_shape, compact_strides); - py::ssize_t *shape_strides = std::get<0>(ptr_size_event_tuple); - if (shape_strides == nullptr) { - sycl::event::wait(host_task_events); - throw std::runtime_error("Unexpected error"); - } - sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); - - if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { - copy_shape_ev.wait(); - sycl::event::wait(host_task_events); - sycl::free(shape_strides, exec_q); - throw std::runtime_error("Unexpected error"); - } - - std::vector dependent_events; - dependent_events.reserve(depends.size() + 1); - dependent_events.insert(dependent_events.end(), copy_shape_ev); - dependent_events.insert(dependent_events.end(), depends.begin(), - depends.end()); - - size_t total_set = strided_fn(exec_q, mask_size, mask_data, nd, - shape_strides, cumsum_data, dependent_events); - - sycl::event::wait(host_task_events); - sycl::free(shape_strides, exec_q); - - return total_set; -} - -// Masked extraction - using dpctl::tensor::kernels::indexing:: masked_extract_all_slices_strided_impl_fn_ptr_t; @@ -493,19 +289,21 @@ py_extract(dpctl::tensor::usm_ndarray src, shT masked_src_shape; shT ortho_src_strides; shT masked_src_strides; - _split_iteration_space(src_shape_vec, src_strides_vec, axis_start, - axis_end, ortho_src_shape, - masked_src_shape, // 4 vectors modified - ortho_src_strides, masked_src_strides); + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis_start, axis_end, + ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); shT ortho_dst_shape; shT masked_dst_shape; shT ortho_dst_strides; shT masked_dst_strides; - _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, - axis_start + 1, ortho_dst_shape, - masked_dst_shape, // 4 vectors modified - ortho_dst_strides, masked_dst_strides); + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_start + 1, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); assert(ortho_src_shape.size() == static_cast(ortho_nd)); assert(ortho_dst_shape.size() == static_cast(ortho_nd)); @@ -821,19 +619,21 @@ py_place(dpctl::tensor::usm_ndarray dst, shT masked_dst_shape; shT ortho_dst_strides; shT masked_dst_strides; - _split_iteration_space(dst_shape_vec, dst_strides_vec, axis_start, - axis_end, ortho_dst_shape, - masked_dst_shape, // 4 vectors modified - ortho_dst_strides, masked_dst_strides); + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_end, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); shT ortho_rhs_shape; shT masked_rhs_shape; shT ortho_rhs_strides; shT masked_rhs_strides; - _split_iteration_space(rhs_shape_vec, rhs_strides_vec, axis_start, - axis_start + 1, ortho_rhs_shape, - masked_rhs_shape, // 4 vectors modified - ortho_rhs_strides, masked_rhs_strides); + dpctl::tensor::py_internal::split_iteration_space( + rhs_shape_vec, rhs_strides_vec, axis_start, axis_start + 1, + ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); 
assert(ortho_dst_shape.size() == static_cast(ortho_nd)); assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp index 647d78bbbb..5ce868894a 100644 --- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -38,13 +38,6 @@ namespace tensor namespace py_internal { -extern void populate_mask_positions_dispatch_vectors(void); - -extern size_t py_mask_positions(dpctl::tensor::usm_ndarray mask, - dpctl::tensor::usm_ndarray cumsum, - sycl::queue exec_q, - std::vector const &depends = {}); - extern std::pair py_extract(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray cumsum, diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.cpp b/dpctl/tensor/libtensor/source/boolean_reductions.cpp index 5def6c5158..db07d05c73 100644 --- a/dpctl/tensor/libtensor/source/boolean_reductions.cpp +++ b/dpctl/tensor/libtensor/source/boolean_reductions.cpp @@ -58,7 +58,9 @@ using dpctl::tensor::kernels::boolean_reduction_strided_impl_fn_ptr; static boolean_reduction_strided_impl_fn_ptr all_reduction_strided_dispatch_vector[td_ns::num_types]; static boolean_reduction_contig_impl_fn_ptr - all_reduction_contig_dispatch_vector[td_ns::num_types]; + all_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static boolean_reduction_contig_impl_fn_ptr + all_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; void populate_all_dispatch_vectors(void) { @@ -74,11 +76,19 @@ void populate_all_dispatch_vectors(void) using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::AllContigFactory; + using dpctl::tensor::kernels::AllAxis1ContigFactory; DispatchVectorBuilder + AllAxis1ContigFactory, td_ns::num_types> all_dvb2; - all_dvb2.populate_dispatch_vector(all_reduction_contig_dispatch_vector); + all_dvb2.populate_dispatch_vector( + all_reduction_axis1_contig_dispatch_vector); + + using dpctl::tensor::kernels::AllAxis0ContigFactory; + DispatchVectorBuilder + all_dvb3; + all_dvb3.populate_dispatch_vector( + all_reduction_axis0_contig_dispatch_vector); }; } // namespace impl @@ -91,7 +101,9 @@ static boolean_reduction_strided_impl_fn_ptr any_reduction_strided_dispatch_vector[td_ns::num_types]; using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; static boolean_reduction_contig_impl_fn_ptr - any_reduction_contig_dispatch_vector[td_ns::num_types]; + any_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static boolean_reduction_contig_impl_fn_ptr + any_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; void populate_any_dispatch_vectors(void) { @@ -107,11 +119,19 @@ void populate_any_dispatch_vectors(void) using dpctl::tensor::kernels::boolean_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::AnyContigFactory; + using dpctl::tensor::kernels::AnyAxis1ContigFactory; DispatchVectorBuilder + AnyAxis1ContigFactory, td_ns::num_types> any_dvb2; - any_dvb2.populate_dispatch_vector(any_reduction_contig_dispatch_vector); + any_dvb2.populate_dispatch_vector( + any_reduction_axis1_contig_dispatch_vector); + + using dpctl::tensor::kernels::AnyAxis0ContigFactory; + DispatchVectorBuilder + any_dvb3; + any_dvb3.populate_dispatch_vector( + any_reduction_axis0_contig_dispatch_vector); }; } // namespace impl @@ -124,16 +144,18 @@ void init_boolean_reduction_functions(py::module_ m) // ALL { impl::populate_all_dispatch_vectors(); - 
using impl::all_reduction_contig_dispatch_vector; + using impl::all_reduction_axis0_contig_dispatch_vector; + using impl::all_reduction_axis1_contig_dispatch_vector; using impl::all_reduction_strided_dispatch_vector; auto all_pyapi = [&](arrayT src, int trailing_dims_to_reduce, arrayT dst, sycl::queue exec_q, const event_vecT &depends = {}) { - return py_boolean_reduction(src, trailing_dims_to_reduce, dst, - exec_q, depends, - all_reduction_contig_dispatch_vector, - all_reduction_strided_dispatch_vector); + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + all_reduction_axis1_contig_dispatch_vector, + all_reduction_axis0_contig_dispatch_vector, + all_reduction_strided_dispatch_vector); }; m.def("_all", all_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -143,16 +165,18 @@ void init_boolean_reduction_functions(py::module_ m) // ANY { impl::populate_any_dispatch_vectors(); - using impl::any_reduction_contig_dispatch_vector; + using impl::any_reduction_axis0_contig_dispatch_vector; + using impl::any_reduction_axis1_contig_dispatch_vector; using impl::any_reduction_strided_dispatch_vector; auto any_pyapi = [&](arrayT src, int trailing_dims_to_reduce, arrayT dst, sycl::queue exec_q, const event_vecT &depends = {}) { - return py_boolean_reduction(src, trailing_dims_to_reduce, dst, - exec_q, depends, - any_reduction_contig_dispatch_vector, - any_reduction_strided_dispatch_vector); + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + any_reduction_axis1_contig_dispatch_vector, + any_reduction_axis0_contig_dispatch_vector, + any_reduction_strided_dispatch_vector); }; m.def("_any", any_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.hpp b/dpctl/tensor/libtensor/source/boolean_reductions.hpp index 3d5970d783..591439a7c9 100644 --- a/dpctl/tensor/libtensor/source/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/source/boolean_reductions.hpp @@ -54,7 +54,8 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, dpctl::tensor::usm_ndarray dst, sycl::queue exec_q, const std::vector &depends, - const contig_dispatchT &contig_dispatch_vector, + const contig_dispatchT &axis1_contig_dispatch_vector, + const contig_dispatchT &axis0_contig_dispatch_vector, const strided_dispatchT &strided_dispatch_vector) { int src_nd = src.get_ndim(); @@ -131,16 +132,32 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, bool is_src_c_contig = src.is_c_contiguous(); bool is_src_f_contig = src.is_f_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nd == 0)) { - auto fn = contig_dispatch_vector[src_typeid]; + (is_src_f_contig && dst_nelems == 0)) + { + auto fn = axis1_contig_dispatch_vector[src_typeid]; constexpr py::ssize_t zero_offset = 0; - auto red_ev = fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, - zero_offset, zero_offset, zero_offset, depends); + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + constexpr py::ssize_t zero_offset = 0; + + sycl::event 
red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); sycl::event keep_args_event = dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); @@ -196,24 +213,48 @@ py_boolean_reduction(dpctl::tensor::usm_ndarray src, simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); } - if ((simplified_red_nd == 1) && (simplified_red_src_strides[0] == 1) && - (iter_nd == 1) && - ((simplified_iter_shape[0] == 1) || - ((simplified_iter_dst_strides[0] == 1) && - (simplified_iter_src_strides[0] == - static_cast(red_nelems))))) - { - auto fn = contig_dispatch_vector[src_typeid]; + if (simplified_red_nd == 1 && iter_nd == 1) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; size_t iter_nelems = dst_nelems; - sycl::event red_ev = - fn(exec_q, iter_nelems, red_nelems, src.get_data(), dst.get_data(), - iter_src_offset, iter_dst_offset, red_src_offset, depends); + if (simplified_red_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iter_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iter_dst_strides[0] == 1) && + (static_cast(simplified_iter_src_strides[0]) == + red_nelems); + } + else if (static_cast(simplified_red_src_strides[0]) == + iter_nelems) { + mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) && + (simplified_iter_src_strides[0] == 1); + } + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_contig_dispatch_vector[src_typeid]; - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); - return std::make_pair(keep_args_event, red_ev); + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } } auto fn = strided_dispatch_vector[src_typeid]; diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp new file mode 100644 index 0000000000..4d6ef27a81 --- /dev/null +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -0,0 +1,559 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/repeat.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t; +static repeat_by_sequence_fn_ptr_t + repeat_by_sequence_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t; +static repeat_by_sequence_1d_fn_ptr_t + repeat_by_sequence_1d_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t; +static repeat_by_scalar_fn_ptr_t + repeat_by_scalar_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t; +static repeat_by_scalar_1d_fn_ptr_t + repeat_by_scalar_1d_dispatch_vector[td_ns::num_types]; + +void init_repeat_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::repeat::RepeatSequenceFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalarFactory; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector); +} + +std::pair +py_repeat_by_sequence(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray reps, + dpctl::tensor::usm_ndarray cumsum, + int axis, + sycl::queue exec_q, + std::vector const &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t reps_sz = reps.get_size(); + size_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + size_t orthog_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; 
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis + 1; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+
+    size_t src_axis_nelems(1);
+    if (src_nd > 0) {
+        src_axis_nelems = src_shape[axis];
+    }
+    size_t dst_axis_nelems(dst_shape[axis]);
+
+    // shape at repeated axis must be equal to the sum of reps
+    if (!same_orthog_dims || src_axis_nelems != reps_sz ||
+        src_axis_nelems != cumsum_sz)
+    {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (orthog_nelems == 0 || src_axis_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // ensure that dst is sufficiently ample
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < static_cast<size_t>(orthog_nelems * dst_axis_nelems)) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the "
+                "array elements.");
+        }
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src, reps, or cumsum
+    if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+    int reps_typenum = reps.get_typenum();
+    int cumsum_typenum = cumsum.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+    int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum);
+    int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    constexpr int int64_typeid = static_cast<int>(td_ns::typenum_t::INT64);
+    if (cumsum_typeid != int64_typeid) {
+        throw py::value_error(
+            "Unexpected data type of `cumsum` array, expecting "
+            "'int64'");
+    }
+
+    if (reps_typeid != cumsum_typeid) {
+        throw py::value_error("`reps` array must have the same elemental "
+                              "data type as cumsum");
+    }
+
+    const char *src_data_p = src.get_data();
+    const char *reps_data_p = reps.get_data();
+    const char *cumsum_data_p = cumsum.get_data();
+    char *dst_data_p = dst.get_data();
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    auto reps_shape_vec = reps.get_shape_vector();
+    auto reps_strides_vec = reps.get_strides_vector();
+
+    sycl::event repeat_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis == 0 && src_nd < 2) {
+        // empty orthogonal directions
+
+        auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid];
+
+        assert(dst_shape_vec.size() == 1);
+        assert(dst_strides_vec.size() == 1);
+
+        py::ssize_t src_shape(0);
+        py::ssize_t src_stride(0);
+        if (src_nd > 0) {
+            src_shape = src_shape_vec[0];
+            src_stride = src_strides_vec[0];
+        }
+
+        repeat_ev =
+            fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p,
+               cumsum_data_p, src_shape, src_stride, dst_shape_vec[0],
+               dst_strides_vec[0], reps_shape_vec[0], reps_strides_vec[0],
+               depends);
+    }
+    else {
+        // non-empty orthogonal directions
+
+        auto fn = repeat_by_sequence_dispatch_vector[src_typeid];
+
+        int orthog_nd = src_nd - 1;
+
+        using shT = std::vector<py::ssize_t>;
+        shT orthog_src_shape;
+        shT orthog_src_strides;
+        shT axis_src_shape;
+        shT axis_src_stride;
+        dpctl::tensor::py_internal::split_iteration_space(
+            src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape,
+            axis_src_shape, orthog_src_strides, axis_src_stride);
+
+        shT orthog_dst_shape;
+        shT orthog_dst_strides;
+        shT axis_dst_shape;
+        shT axis_dst_stride;
+        dpctl::tensor::py_internal::split_iteration_space(
+            dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape,
+            axis_dst_shape, orthog_dst_strides, axis_dst_stride);
+
+        assert(orthog_src_shape.size() == static_cast<size_t>(orthog_nd));
+        assert(orthog_dst_shape.size() == static_cast<size_t>(orthog_nd));
+        assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(),
+                          orthog_dst_shape.begin()));
+
+        std::vector<py::ssize_t> simplified_orthog_shape;
+        std::vector<py::ssize_t> simplified_orthog_src_strides;
+        std::vector<py::ssize_t> simplified_orthog_dst_strides;
+
+        const py::ssize_t *_shape = orthog_src_shape.data();
+
+        py::ssize_t orthog_src_offset(0);
+        py::ssize_t orthog_dst_offset(0);
+        dpctl::tensor::py_internal::simplify_iteration_space(
+            orthog_nd, _shape, orthog_src_strides, orthog_dst_strides,
+            // output
+            simplified_orthog_shape, simplified_orthog_src_strides,
+            simplified_orthog_dst_strides, orthog_src_offset,
+            orthog_dst_offset);
+
+        using dpctl::tensor::offset_utils::device_allocate_and_pack;
+        const auto &ptr_size_event_tuple1 =
+            device_allocate_and_pack<py::ssize_t>(
+                exec_q, host_task_events, simplified_orthog_shape,
+                simplified_orthog_src_strides, simplified_orthog_dst_strides);
+        py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1);
+        if (packed_shapes_strides == nullptr) {
+            throw std::runtime_error("Unable to allocate device memory");
+        }
+        sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1);
+
+        std::vector<sycl::event> all_deps;
+        all_deps.reserve(depends.size() + 1);
+        all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+        all_deps.push_back(copy_shapes_strides_ev);
+
+        assert(all_deps.size() == depends.size() + 1);
+
+        repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p,
+                       dst_data_p, reps_data_p, cumsum_data_p,
+                       // data to build orthog indexer
+                       orthog_nd, packed_shapes_strides, orthog_src_offset,
+                       orthog_dst_offset,
+                       // data to build indexers along repeated axis in src
+                       axis_src_shape[0], axis_src_stride[0],
+                       // data to build indexer along repeated axis in dst
+                       axis_dst_shape[0], axis_dst_stride[0],
+                       // data to build indexer for reps array
+                       reps_shape_vec[0], reps_strides_vec[0], all_deps);
+
+        sycl::event cleanup_tmp_allocations_ev =
+            exec_q.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(repeat_ev);
+                auto ctx = exec_q.get_context();
+                cgh.host_task([ctx, packed_shapes_strides] {
+                    sycl::free(packed_shapes_strides, ctx);
+                });
+            });
+        host_task_events.push_back(cleanup_tmp_allocations_ev);
+    }
+
+    host_task_events.push_back(repeat_ev);
+
+    sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive(
+        exec_q, {src, reps, cumsum, dst}, host_task_events);
+
+    return std::make_pair(py_obj_management_host_task_ev, repeat_ev);
+}
+
+std::pair<sycl::event, sycl::event>
+py_repeat_by_scalar(dpctl::tensor::usm_ndarray src,
+                    dpctl::tensor::usm_ndarray dst,
+                    const py::ssize_t reps,
+                    int axis,
+                    sycl::queue exec_q,
+                    std::vector<sycl::event> const &depends)
+{
+    int src_nd = src.get_ndim();
+    if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) ||
+        (axis > 0 && src_nd == 0)) {
+        throw py::value_error("Specified axis is invalid.");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) {
+        throw py::value_error("Number of dimensions of source and destination "
+                              "arrays is not consistent");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+    bool same_orthog_dims(true);
+    size_t orthog_nelems(1); // number of orthogonal iterations
+
+    for (auto i = 0; i < axis; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+    for (auto i = axis + 1; i < src_nd; ++i) {
+        auto src_sh_i = src_shape[i];
+        orthog_nelems *= src_sh_i;
+        same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]);
+    }
+
+    size_t src_axis_nelems(1);
+    if (src_nd > 0) {
+        src_axis_nelems = src_shape[axis];
+    }
+    size_t dst_axis_nelems(dst_shape[axis]);
+
+    // shape at repeated axis must be equal to the shape of src at the axis *
+    // reps
+    if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) {
+        throw py::value_error("Inconsistent array dimensions");
+    }
+
+    if (orthog_nelems == 0 || src_axis_nelems == 0) {
+        return std::make_pair(sycl::event(), sycl::event());
+    }
+
+    // ensure that dst is sufficiently ample
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 <
+            static_cast<size_t>(orthog_nelems * (src_axis_nelems * reps))) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the "
+                "array elements.");
+        }
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    // check that dst does not intersect with src
+    if (overlap(dst, src)) {
+        throw py::value_error("Destination array overlaps with inputs");
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid) {
+        throw py::value_error(
+            "Destination array must have the same elemental data type");
+    }
+
+    const char *src_data_p = src.get_data();
+    char *dst_data_p = dst.get_data();
+
+    auto src_shape_vec = src.get_shape_vector();
+    auto src_strides_vec = src.get_strides_vector();
+
+    auto dst_shape_vec = dst.get_shape_vector();
+    auto dst_strides_vec = dst.get_strides_vector();
+
+    sycl::event repeat_ev;
+    std::vector<sycl::event> host_task_events{};
+    if (axis == 0 && src_nd < 2) {
+        // empty orthogonal directions
+
+        auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid];
+
+        assert(dst_shape_vec.size() == 1);
+        assert(dst_strides_vec.size() == 1);
+
+        py::ssize_t src_shape(0);
+        py::ssize_t src_stride(0);
+        if (src_nd > 0) {
+            src_shape = src_shape_vec[0];
+            src_stride = src_strides_vec[0];
+        }
+        repeat_ev =
+            fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, src_shape,
+               src_stride, dst_shape_vec[0], dst_strides_vec[0], depends);
+    }
+    else {
+        // non-empty orthogonal directions
+
+        auto fn = repeat_by_scalar_dispatch_vector[src_typeid];
+
+        int orthog_nd = src_nd - 1;
+
+
using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape, + axis_src_shape, orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, + axis_dst_shape, orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + std::vector simplified_orthog_shape; + std::vector simplified_orthog_src_strides; + std::vector simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + py::ssize_t *packed_shapes_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_shapes_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p, + dst_data_p, reps, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexer along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_shapes_strides] { + sycl::free(packed_shapes_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.hpp b/dpctl/tensor/libtensor/source/repeat.hpp new file mode 100644 index 0000000000..e7ec59f209 --- /dev/null +++ b/dpctl/tensor/libtensor/source/repeat.hpp @@ -0,0 +1,61 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you 
may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_repeat_dispatch_vectors(void); + +extern std::pair +py_repeat_by_sequence(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + dpctl::tensor::usm_ndarray reps, + dpctl::tensor::usm_ndarray cumsum, + int axis, + sycl::queue exec_q, + std::vector const &depends); + +extern std::pair +py_repeat_by_scalar(dpctl::tensor::usm_ndarray src, + dpctl::tensor::usm_ndarray dst, + const py::ssize_t reps, + int axis, + sycl::queue exec_q, + std::vector const &depends); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp index 31f5250f8a..e77bbbb5ce 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.cpp @@ -408,6 +408,50 @@ void compact_iteration_space(int &nd, } } +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + py::ssize_t _ravel_multi_index_c(std::vector const &mi, std::vector const &shape) { diff --git a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp index 275129ad5c..74a80c98b0 100644 --- a/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl/tensor/libtensor/source/simplify_iteration_space.hpp @@ -97,6 +97,16 @@ void 
compact_iteration_space(int &, std::vector &, std::vector &); +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + py::ssize_t _ravel_multi_index_c(std::vector const &, std::vector const &); py::ssize_t _ravel_multi_index_f(std::vector const &, diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 691a56b11f..fec246325c 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -33,6 +33,7 @@ #include "dpctl4pybind11.hpp" +#include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" #include "boolean_reductions.hpp" #include "copy_and_cast_usm_to_usm.hpp" @@ -45,6 +46,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" +#include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "sum_reductions.hpp" #include "triul_ctor.hpp" @@ -96,6 +98,11 @@ using dpctl::tensor::py_internal::py_mask_positions; using dpctl::tensor::py_internal::py_nonzero; using dpctl::tensor::py_internal::py_place; +/* ================= Repeat ====================*/ +using dpctl::tensor::py_internal::py_cumsum_1d; +using dpctl::tensor::py_internal::py_repeat_by_scalar; +using dpctl::tensor::py_internal::py_repeat_by_sequence; + /* ================ Eye ================== */ using dpctl::tensor::py_internal::usm_ndarray_eye; @@ -132,10 +139,14 @@ void init_dispatch_vectors(void) init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); - populate_mask_positions_dispatch_vectors(); populate_masked_extract_dispatch_vectors(); populate_masked_place_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); + + populate_cumsum_1d_dispatch_vectors(); + init_repeat_dispatch_vectors(); + return; } @@ -353,6 +364,9 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("cumsum"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); @@ -387,6 +401,14 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_repeat_by_sequence", &py_repeat_by_sequence, "", py::arg("src"), + py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_repeat_by_scalar", &py_repeat_by_scalar, "", py::arg("src"), + py::arg("dst"), py::arg("reps"), py::arg("axis"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py index 6152a15aae..2126727d5b 100644 --- a/dpctl/tests/test_usm_ndarray_manipulation.py +++ b/dpctl/tests/test_usm_ndarray_manipulation.py @@ -17,11 +17,12 @@ import numpy as np import pytest -from helper import get_queue_or_skip from numpy.testing import assert_, assert_array_equal, assert_raises_regex import dpctl import dpctl.tensor as dpt +from dpctl.tests.helper import 
get_queue_or_skip +from dpctl.utils import ExecutionPlacementError def test_permute_dims_incorrect_type(): @@ -1116,3 +1117,314 @@ def test_finfo_object(): assert isinstance(fi.dtype, dpt.dtype) assert isinstance(str(fi), str) assert isinstance(repr(fi), str) + + +def test_repeat_scalar_sequence_agreement(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + expected_res = dpt.empty(10, dtype="i4") + expected_res[1::2], expected_res[::2] = x, x + + # scalar case + reps = 2 + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + # tuple + reps = (2, 2, 2, 2, 2) + res = dpt.repeat(x, reps) + assert dpt.all(res == expected_res) + + +def test_repeat_as_broadcasting(): + get_queue_or_skip() + + reps = 5 + x = dpt.arange(reps, dtype="i4") + x1 = x[:, dpt.newaxis] + expected_res = dpt.broadcast_to(x1, (reps, reps)) + + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + + x2 = x[dpt.newaxis, :] + expected_res = dpt.broadcast_to(x2, (reps, reps)) + + res = dpt.repeat(x2, reps, axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_axes(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(5 * 10, dtype="i4"), (5, 10)) + expected_res = dpt.empty((x.shape[0] * 2, x.shape[1]), x.dtype) + expected_res[::2, :], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + + expected_res = dpt.empty((x.shape[0], x.shape[1] * 2), x.dtype) + expected_res[:, ::2], expected_res[:, 1::2] = x, x + res = dpt.repeat(x, reps, axis=1) + assert dpt.all(res == expected_res) + + +def test_repeat_size_0_outputs(): + get_queue_or_skip() + + x = dpt.ones((3, 0, 5), dtype="i4") + reps = 10 + res = dpt.repeat(x, reps, axis=0) + assert res.size == 0 + assert res.shape == (30, 0, 5) + + res = dpt.repeat(x, reps, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + res = dpt.repeat(x, (2, 2, 2), axis=0) + assert res.size == 0 + assert res.shape == (6, 0, 5) + + x = dpt.ones((3, 2, 5)) + res = dpt.repeat(x, 0, axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + x = dpt.ones((3, 2, 5)) + res = dpt.repeat(x, (0, 0), axis=1) + assert res.size == 0 + assert res.shape == (3, 0, 5) + + +def test_repeat_strides(): + get_queue_or_skip() + + reps = 2 + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, ::2], expected_res[:, 1::2] = x1, x1 + res = dpt.repeat(x1, reps, axis=1) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[1], axis=1) + assert dpt.all(res == expected_res) + + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[::2, :], expected_res[1::2, :] = x1, x1 + res = dpt.repeat(x1, reps, axis=0) + assert dpt.all(res == expected_res) + res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0) + assert dpt.all(res == expected_res) + + +def test_repeat_casting(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + # i4 is cast to i8 + reps = dpt.ones(5, dtype="i4") + res = dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_strided_repeats(): + get_queue_or_skip() + + x = dpt.arange(5, dtype="i4") + reps = dpt.ones(10, dtype="i8") + reps[::2] = 0 + reps = reps[::-2] + res = dpt.repeat(x, reps) + assert res.shape == x.shape + assert dpt.all(res == x) + + +def test_repeat_arg_validation(): + get_queue_or_skip() + + x = dict() + with pytest.raises(TypeError): + dpt.repeat(x, 2) + + # axis must be 0 for 
scalar + x = dpt.empty(()) + with pytest.raises(ValueError): + dpt.repeat(x, 2, axis=1) + + # x.ndim cannot be > 1 for axis=None + x = dpt.empty((5, 10)) + with pytest.raises(ValueError): + dpt.repeat(x, 2, axis=None) + + # repeats must be positive + x = dpt.empty(5) + with pytest.raises(ValueError): + dpt.repeat(x, -2) + + # repeats must be integers + with pytest.raises(TypeError): + dpt.repeat(x, 2.0) + + # repeats tuple must be the same length as axis + with pytest.raises(ValueError): + dpt.repeat(x, (1, 2)) + + # repeats tuple elements must be positive + with pytest.raises(ValueError): + dpt.repeat(x, (-1,)) + + # repeats must be int or tuple + with pytest.raises(TypeError): + dpt.repeat(x, dict()) + + # repeats array must be 0d or 1d + with pytest.raises(ValueError): + dpt.repeat(x, dpt.ones((1, 1), dtype="i8")) + + # repeats must be castable to i8 + with pytest.raises(TypeError): + dpt.repeat(x, dpt.asarray(2.0, dtype="f4")) + + # compute follows data + q2 = dpctl.SyclQueue() + reps = dpt.asarray(1, dtype="i8", sycl_queue=q2) + with pytest.raises(ExecutionPlacementError): + dpt.repeat(x, reps) + + # repeats array must not contain negative elements + reps = dpt.asarray(-1, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + reps = dpt.asarray([1, 1, 1, 1, -1], dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + # repeats must broadcastable to axis size + reps = dpt.arange(10, dtype="i8") + with pytest.raises(ValueError): + dpt.repeat(x, reps) + + +def test_tile_basic(): + get_queue_or_skip() + + reps = 2 + x = dpt.arange(5, dtype="i4") + res = dpt.tile(x, reps) + assert res.shape == (x.shape[0] * reps,) + assert dpt.all(res[: x.size] == res[x.size :]) + + reps = (2, 1) + expected_sh = (2, x.shape[0]) + expected_res = dpt.broadcast_to(x, expected_sh) + res = dpt.tile(x, reps) + assert res.shape == expected_sh + assert dpt.all(expected_res == res) + + +def test_tile_size_1(): + get_queue_or_skip() + + reps = 5 + # test for 0d array + x = dpt.asarray(2, dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + # test for 1d array with single element + x = dpt.asarray([2], dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == dpt.full(reps, 2, dtype="i4")) + + # test empty reps returns copy of input + reps = () + res = dpt.tile(x, reps) + assert x.shape == res.shape + assert x == res + + +def test_tile_prepends_axes(): + get_queue_or_skip() + + reps = (2,) + x = dpt.ones((5, 10), dtype="i4") + expected_res = dpt.ones((5, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + reps = (3, 2, 2) + expected_res = dpt.ones((3, 10, 20), dtype="i4") + res = dpt.tile(x, reps) + assert dpt.all(res == expected_res) + + +def test_tile_empty_outputs(): + get_queue_or_skip() + + x = dpt.asarray((), dtype="i4") + reps = 10 + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0,) + + x = dpt.ones((3, 0, 5), dtype="i4") + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (3, 0, 50) + + reps = (2, 1, 2) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (6, 0, 10) + + x = dpt.ones((2, 3, 4), dtype="i4") + reps = (0, 1, 1) + res = dpt.tile(x, reps) + assert res.size == 0 + assert res.shape == (0, 3, 4) + + +def test_tile_strides(): + get_queue_or_skip() + + reps = (1, 2) + x = dpt.reshape(dpt.arange(10 * 10, dtype="i4"), (10, 10)) + x1 = x[:, ::-2] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[:, : x1.shape[1]], 
expected_res[:, x1.shape[1] :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + reps = (2, 1) + x1 = x[::-2, :] + expected_res = dpt.empty((10, 10), dtype="i4") + expected_res[: x1.shape[0], :], expected_res[x1.shape[0] :, :] = x1, x1 + res = dpt.tile(x1, reps) + assert dpt.all(res == expected_res) + + +def test_tile_size_1_axes(): + get_queue_or_skip() + + reps = (1, 2, 1) + x = dpt.ones((2, 1, 3), dtype="i4") + res = dpt.tile(x, reps) + expected_res = dpt.broadcast_to(x, (2, 2, 3)) + assert dpt.all(res == expected_res) + + +def test_tile_arg_validation(): + get_queue_or_skip() + + with pytest.raises(TypeError): + dpt.tile(dict(), 2) + + # repetitions must be int or tuple + x = dpt.empty(()) + with pytest.raises(TypeError): + dpt.tile(x, dict())