diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 43e7e8ae90..268c679f00 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -52,32 +52,28 @@ namespace tu_ns = dpctl::tensor::type_utils;
 template <typename argT1, typename argT2, typename resT>
 struct FloorDivideFunctor
 {
-
-    using supports_sg_loadstore = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
-    using supports_vec = std::negation<
-        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
 
     resT operator()(const argT1 &in1, const argT2 &in2)
     {
-        auto tmp = in1 / in2;
-        if constexpr (std::is_integral_v<decltype(tmp)>) {
-            if constexpr (std::is_unsigned_v<decltype(tmp)>) {
-                return (in2 == argT2(0)) ? resT(0) : tmp;
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            if (in2 == argT2(0)) {
+                return resT(0);
+            }
+            if constexpr (std::is_signed_v<argT1> || std::is_signed_v<argT2>) {
+                auto div = in1 / in2;
+                auto mod = in1 % in2;
+                auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0));
+                return (div - corr);
             }
             else {
-                if (in2 == argT2(0)) {
-                    return resT(0);
-                }
-                else {
-                    auto rem = in1 % in2;
-                    auto corr = (rem != 0 && ((rem < 0) != (in2 < 0)));
-                    return (tmp - corr);
-                }
+                return (in1 / in2);
             }
         }
         else {
-            return sycl::floor(tmp);
+            auto div = in1 / in2;
+            return (div == resT(0)) ? div : resT(std::floor(div));
         }
     }
 
@@ -85,29 +81,31 @@ struct FloorDivideFunctor
     sycl::vec<resT, vec_sz> operator()(const sycl::vec<argT1, vec_sz> &in1,
                                        const sycl::vec<argT2, vec_sz> &in2)
     {
-        auto tmp = in1 / in2;
-        using tmpT = typename decltype(tmp)::element_type;
-        if constexpr (std::is_integral_v<tmpT>) {
-            if constexpr (std::is_signed_v<tmpT>) {
-                auto rem_tmp = in1 % in2;
+        if constexpr (std::is_integral_v<resT>) {
+            sycl::vec<resT, vec_sz> res;
 #pragma unroll
-                for (int i = 0; i < vec_sz; ++i) {
-                    if (in2[i] == argT2(0)) {
-                        tmp[i] = tmpT(0);
-                    }
-                    else {
-                        tmpT corr = (rem_tmp[i] != 0 &&
-                                     ((rem_tmp[i] < 0) != (in2[i] < 0)));
-                        tmp[i] -= corr;
+            for (int i = 0; i < vec_sz; ++i) {
+                if (in2[i] == argT2(0)) {
+                    res[i] = resT(0);
+                }
+                else {
+                    res[i] = in1[i] / in2[i];
+                    if constexpr (std::is_signed_v<resT>) {
+                        auto mod = in1[i] % in2[i];
+                        auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0));
+                        res[i] -= corr;
                     }
                 }
             }
-            else {
+            return res;
+        }
+        else {
+            auto tmp = in1 / in2;
+            using tmpT = typename decltype(tmp)::element_type;
 #pragma unroll
-                for (int i = 0; i < vec_sz; ++i) {
-                    if (in2[i] == argT2(0)) {
-                        tmp[i] = tmpT(0);
-                    }
+            for (int i = 0; i < vec_sz; ++i) {
+                if (in2[i] != argT2(0)) {
+                    tmp[i] = std::floor(tmp[i]);
                 }
             }
             if constexpr (std::is_same_v<resT, tmpT>) {
@@ -118,19 +116,12 @@ struct FloorDivideFunctor
                 return vec_cast<resT, tmpT, vec_sz>(tmp);
             }
         }
-        else {
-            sycl::vec<resT, vec_sz> res = sycl::floor(tmp);
-            if constexpr (std::is_same_v<resT,
-                                         typename decltype(res)::element_type>)
-            {
-                return res;
-            }
-            else {
-                using dpctl::tensor::type_utils::vec_cast;
-                return vec_cast<resT, typename decltype(res)::element_type,
-                                vec_sz>(res);
-            }
-        }
+    }
+
+private:
+    bool l_xor(bool b1, bool b2) const
+    {
+        return (b1 != b2);
     }
 };
 
diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py
index 61b77afccd..c8ba5e80f1 100644
--- a/dpctl/tests/elementwise/test_floor_divide.py
+++ b/dpctl/tests/elementwise/test_floor_divide.py
@@ -203,16 +203,6 @@ def test_floor_divide_gh_1247():
         dpt.asnumpy(res), np.full(res.shape, -1, dtype=res.dtype)
     )
 
-    # attempt to invoke sycl::vec overload using a larger array
-    x = dpt.arange(-64, 65, 1, dtype="i4")
-    np.testing.assert_array_equal(
-        dpt.asnumpy(dpt.floor_divide(x, 3)), np.floor_divide(dpt.asnumpy(x), 3)
-    )
-    np.testing.assert_array_equal(
-        dpt.asnumpy(dpt.floor_divide(x, -3)),
-        np.floor_divide(dpt.asnumpy(x), -3),
-    )
-
 
 @pytest.mark.parametrize("dtype", _no_complex_dtypes[1:9])
 def test_floor_divide_integer_zero(dtype):
@@ -226,10 +216,42 @@ def test_floor_divide_integer_zero(dtype):
         dpt.asnumpy(res), np.zeros(x.shape, dtype=res.dtype)
     )
 
-    # attempt to invoke sycl::vec overload using a larger array
-    x = dpt.arange(129, dtype=dtype, sycl_queue=q)
-    y = dpt.zeros_like(x, sycl_queue=q)
+
+def test_floor_divide_special_cases():
+    q = get_queue_or_skip()
+
+    x = dpt.empty(1, dtype="f4", sycl_queue=q)
+    y = dpt.empty_like(x)
+    x[0], y[0] = dpt.inf, dpt.inf
+    res = dpt.floor_divide(x, y)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = 0.0, -1.0
+    res = dpt.floor_divide(x, y)
+    x_np = dpt.asnumpy(x)
+    y_np = dpt.asnumpy(y)
+    res_np = np.floor_divide(x_np, y_np)
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    res = dpt.floor_divide(y, x)
+    with np.errstate(all="ignore"):
+        res_np = np.floor_divide(y_np, x_np)
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
+
+    x[0], y[0] = -1.0, dpt.inf
     res = dpt.floor_divide(x, y)
     np.testing.assert_array_equal(
-        dpt.asnumpy(res), np.zeros(x.shape, dtype=res.dtype)
+        dpt.asnumpy(res), np.asarray([-0.0], dtype="f4")
     )
+
+    res = dpt.floor_divide(y, x)
+    np.testing.assert_array_equal(
+        dpt.asnumpy(res), np.asarray([-dpt.inf], dtype="f4")
+    )
+
+    x[0], y[0] = 1.0, dpt.nan
+    res = dpt.floor_divide(x, y)
+    res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y))
+    np.testing.assert_array_equal(dpt.asnumpy(res), res_np)
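
For context, the signed-integer branch added above implements Python-style floored division on top of C++'s truncating `/`: the truncated quotient is decremented by one exactly when the remainder is non-zero and the operands have opposite signs (the `l_xor` helper), and division by zero is mapped to 0, matching `test_floor_divide_integer_zero`. Below is a minimal standalone sketch of that correction, independent of SYCL; the `floor_div` helper, `main`, and the sample values are illustrative only and are not part of the patch.

#include <cassert>

namespace {

// Logical XOR of two bools, mirroring the l_xor helper in the functor.
bool l_xor(bool b1, bool b2) { return b1 != b2; }

// Floored division for signed integers: C++ '/' truncates toward zero, so
// subtract 1 when there is a remainder and the operands have opposite signs.
int floor_div(int in1, int in2)
{
    if (in2 == 0) {
        return 0; // the kernel defines integer division by zero as 0
    }
    int div = in1 / in2;
    int mod = in1 % in2;
    int corr = (mod != 0 && l_xor(mod < 0, in2 < 0));
    return div - corr;
}

} // namespace

int main()
{
    assert(floor_div(7, 3) == 2);
    assert(floor_div(-7, 3) == -3); // truncation alone would give -2
    assert(floor_div(7, -3) == -3);
    assert(floor_div(-7, -3) == 2);
    assert(floor_div(-1, 0) == 0);
    return 0;
}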