I'm a bit unsure of how to best submit this feedback because I see two major issues, and it's still unclear to me how much they might be entangled. This ticket tries to sum things up.
**Report of wrong compiled code generated by numba_dpex**
A `matmul` kernel gives a wrong result despite no error having been found in the kernel so far: see 2D vector sum using usm_ndarrays does not work in kernels #871 (comment) and the thread that follows.
Python code snippet with unexpected output:

```python
import sklearn_numba_dpex
import numba_dpex as dpex
import numpy as np
import dpctl.tensor as dpt

square_block_side = 2
work_group_size = (square_block_side, square_block_side)
dtype = np.float32


@dpex.kernel
def matmul(
    X,       # IN READ-ONLY (X_n_rows, n_cols)
    y,       # IN READ-ONLY (n_cols, y_n_rows),
    result,  # OUT (X_n_rows, y_n_rows)
):
    X_n_rows = X.shape[0]
    Y_n_cols = y.shape[1]
    n_cols = X.shape[1]

    result_row_idx = dpex.get_global_id(0)
    result_col_idx = dpex.get_global_id(1)

    local_row_idx = dpex.get_local_id(0)
    local_col_idx = dpex.get_local_id(1)

    n_blocks_for_cols = n_cols // square_block_side
    if (n_cols % square_block_side) > 0:
        n_blocks_for_cols += 1

    X_sliding_window = dpex.local.array(shape=work_group_size, dtype=dtype)
    Y_sliding_window = dpex.local.array(shape=work_group_size, dtype=dtype)

    output = dtype(0)

    for block_idx in range(n_blocks_for_cols):
        X_sliding_window[local_row_idx, local_col_idx] = dtype(0)
        Y_sliding_window[local_row_idx, local_col_idx] = dtype(0)
        if (result_row_idx < X_n_rows) and (
            (local_col_idx + (square_block_side * block_idx)) < n_cols
        ):
            X_sliding_window[local_row_idx, local_col_idx] = X[
                result_row_idx, local_col_idx + (square_block_side * block_idx)
            ]
        if (result_col_idx < Y_n_cols) and (
            (local_row_idx + (square_block_side * block_idx)) < n_cols
        ):
            Y_sliding_window[local_row_idx, local_col_idx] = y[
                local_row_idx + (square_block_side * block_idx), result_col_idx
            ]
        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)
        for idx in range(square_block_side):
            output += (
                X_sliding_window[local_row_idx, idx]
                * Y_sliding_window[idx, local_col_idx]
            )
        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)

    if (result_row_idx < X_n_rows) and (result_col_idx < Y_n_cols):
        result[result_row_idx, result_col_idx] = output


def _arange_reshaped(shape, dtype):
    n_items = shape[0] * shape[1]
    return np.arange(n_items, dtype=dtype).reshape(shape)


X = _arange_reshaped((5, 5), dtype)
Y = _arange_reshaped((5, 5), dtype)
print(np.matmul(X, Y))

X = dpt.asarray(X)
Y = dpt.asarray(Y)
device = X.device.sycl_device
result = dpt.zeros((5, 5), dtype, device=device)
matmul[(6, 6), (2, 2)](X, Y, result)
print(result)
```
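For what it's worth, the unexpected output can be turned into a programmatic check; a minimal sketch reusing the names from the snippet above (the assertion is expected to fail on an affected build):

```python
# Hypothetical check, not part of the original reproducer: compare the
# kernel output against NumPy's reference matmul on the same inputs.
expected = np.matmul(
    _arange_reshaped((5, 5), dtype), _arange_reshaped((5, 5), dtype)
)
np.testing.assert_allclose(dpt.asnumpy(result), expected)
```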
Equivalent DPC++ snippet provided by @oleksandr-pavlyk that gives the expected output:

```cpp
#include <CL/sycl.hpp>
#include <vector>
#include <iostream>

template <typename T>
sycl::event
gemm(
    sycl::queue q,
    const T* X_ptr, const T* Y_ptr, T* R_ptr,
    size_t X_n_rows,
    size_t X_n_cols,
    size_t Y_n_cols,
    size_t gws0_blocks, size_t gws1_blocks, size_t lws0, size_t lws1,
    size_t square_block_size,
    const std::vector<sycl::event> &depends={}
)
{
    sycl::event comp_ev =
        q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(depends);

            sycl::local_accessor<T, 2> X_sliding_window(
                sycl::range<2>(lws0, lws1), cgh);
            sycl::local_accessor<T, 2> Y_sliding_window(
                sycl::range<2>(lws0, lws1), cgh);

            auto gwsRange = sycl::range<2>(
                gws0_blocks * lws0, gws1_blocks * lws1);
            auto lwsRange = sycl::range<2>(lws0, lws1);

            size_t n_cols = X_n_cols;

            cgh.parallel_for(
                sycl::nd_range<2>(gwsRange, lwsRange),
                [=](sycl::nd_item<2> ndit) {
                    size_t result_row_idx = ndit.get_global_id(0);
                    size_t result_col_idx = ndit.get_global_id(1);
                    size_t local_row_idx = ndit.get_local_id(0);
                    size_t local_col_idx = ndit.get_local_id(1);

                    size_t n_blocks_for_cols =
                        (n_cols + square_block_size - 1) / square_block_size;

                    T output(0);
                    for (size_t block_idx = 0; block_idx < n_blocks_for_cols; ++block_idx) {
                        size_t col_idx = local_col_idx + square_block_size * block_idx;
                        T X_v = (result_row_idx < X_n_rows && col_idx < n_cols)
                                    ? X_ptr[result_row_idx * X_n_cols + col_idx]
                                    : T(0);
                        X_sliding_window[ndit.get_local_id()] = X_v;

                        size_t row_idx = local_row_idx + square_block_size * block_idx;
                        T Y_v = (result_col_idx < Y_n_cols && row_idx < n_cols)
                                    ? Y_ptr[row_idx * Y_n_cols + result_col_idx]
                                    : T(0);
                        Y_sliding_window[ndit.get_local_id()] = Y_v;

                        ndit.barrier(sycl::access::fence_space::local_space);
                        for (size_t idx = 0; idx < square_block_size; ++idx) {
                            output += X_sliding_window[sycl::id<2>(local_row_idx, idx)]
                                    * Y_sliding_window[sycl::id<2>(idx, local_col_idx)];
                        }
                        ndit.barrier(sycl::access::fence_space::local_space);
                    }
                    if (result_row_idx < X_n_rows && result_col_idx < Y_n_cols) {
                        R_ptr[result_row_idx * Y_n_cols + result_col_idx] = output;
                    }
                }
            );
        });

    return comp_ev;
}

int main(void) {
    using T = float;

    size_t n = 5;
    size_t square_block_side = 2;

    sycl::queue q{ sycl::default_selector_v };

    size_t n_elems = n * n;
    T *X_ptr = sycl::malloc_device<T>(n_elems, q);
    T *Y_ptr = sycl::malloc_device<T>(n_elems, q);
    T *R_ptr = sycl::malloc_device<T>(n_elems, q);

    sycl::event pop_ev =
        q.submit([&](sycl::handler &cgh) {
            cgh.parallel_for({n_elems}, [=](sycl::id<1> id) {
                T v(id[0]);
                X_ptr[id] = v;
                Y_ptr[id] = v;
                R_ptr[id] = T(-7);
            });
        });

    size_t lws0 = 2;
    size_t lws1 = 2;
    size_t gws0_blocks = (n + lws0 - 1) / lws0;
    size_t gws1_blocks = (n + lws1 - 1) / lws1;

    sycl::event comp_ev =
        gemm(q, X_ptr, Y_ptr, R_ptr, n, n, n,
             gws0_blocks, gws1_blocks, lws0, lws1, lws0, {pop_ev});

    T *host_res = new T[n_elems];
    sycl::event copy_ev = q.copy<T>(R_ptr, host_res, n_elems, {comp_ev});
    copy_ev.wait();

    for (size_t i0 = 0; i0 < n; ++i0) {
        std::cout << "[ ";
        for (size_t i1 = 0; i1 < n; ++i1) {
            std::cout << host_res[i0 * n + i1] << " ";
        }
        std::cout << "]" << std::endl;
    }

    delete[] host_res;
    sycl::free(X_ptr, q);
    sycl::free(Y_ptr, q);
    sycl::free(R_ptr, q);

    return 0;
}
```
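For reference, the snippet above uses SYCL 2020 features (`sycl::local_accessor`, `sycl::default_selector_v`), so it should build with a recent DPC++ compiler, e.g. `icpx -fsycl gemm.cpp -o gemm` (file name hypothetical).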
- This issue can be reproduced in commits both before and after Refactor/kernel interfaces #804.
- Curiously enough, it doesn't seem to affect the kernels we've written for KMeans so far, despite those being rather complicated and having good test coverage.
- Maybe related: get_global_id() indexing order doesn't follow that of SYCL indexing #889 and 2D vector sum using usm_ndarrays does not work in kernels #871 (comment); a small indexing probe is sketched after this list.
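In case it helps with the indexing question, here is a minimal probe sketch (untested, all names are mine) that records which `get_global_id` pair each work-item sees, so the ordering can be compared against the SYCL convention:

```python
import numba_dpex as dpex
import dpctl.tensor as dpt
import numpy as np


@dpex.kernel
def record_ids(out0, out1):
    # Each work-item writes its own global id pair; the resulting
    # matrices show which axis numba_dpex maps to which dimension.
    i = dpex.get_global_id(0)
    j = dpex.get_global_id(1)
    out0[i, j] = i
    out1[i, j] = j


out0 = dpt.zeros((4, 4), dtype=np.int64)
out1 = dpt.zeros((4, 4), dtype=np.int64)
record_ids[(4, 4), (2, 2)](out0, out1)
print(dpt.asnumpy(out0))  # each row constant if axis 0 indexes rows
print(dpt.asnumpy(out1))  # each column constant if axis 1 indexes columns
```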
**Report of new issues introduced by the refactoring in #804**
I can tell that our KMeans code started to display new bugs (on GPU: huge slowdown and wrong output; on CPU: segfaults) after #804 was merged. This was reported in #886, but I just realized I misfiled the report by suggesting it comes from #877 rather than #804 🤦. Unfortunately, #888 does not fix the issues. I haven't investigated enough yet to file a more accurate report with a minimal reproducer; it's still on my todo list.
But I think the issues above should be cleared first: if the compiled code indeed doesn't match the Python code, everything else is going to be too hard to debug, and fixing that might also fix this.
@diptorupd @chudur-budur maybe investigating would be easier if we scheduled a live debugging session?