Sum-up of dpex.kernel JIT issues #891

@fcharras

Description

I'm a bit unsure how to submit feedback here, because I see two major issues and it's still unclear to me how entangled they might be. This ticket tries to sum them up.

Report of wrong compiled code generated by numba_dpex

Python code snippet with unexpected output:

```python
import sklearn_numba_dpex
import numba_dpex as dpex
import numpy as np
import dpctl.tensor as dpt

square_block_side = 2
work_group_size = (square_block_side, square_block_side)
dtype = np.float32


@dpex.kernel
def matmul(
    X,         # IN READ-ONLY    (X_n_rows, n_cols)
    y,         # IN READ-ONLY    (n_cols, y_n_rows),
    result,    # OUT             (X_n_rows, y_n_rows)
):

    X_n_rows = X.shape[0]
    Y_n_cols = y.shape[1]
    n_cols = X.shape[1]

    result_row_idx = dpex.get_global_id(0)
    result_col_idx = dpex.get_global_id(1)

    local_row_idx = dpex.get_local_id(0)
    local_col_idx = dpex.get_local_id(1)

    n_blocks_for_cols = n_cols // square_block_side
    if (n_cols % square_block_side) > 0:
        n_blocks_for_cols += 1

    X_sliding_window = dpex.local.array(shape=work_group_size, dtype=dtype)
    Y_sliding_window = dpex.local.array(shape=work_group_size, dtype=dtype)

    output = dtype(0)

    for block_idx in range(n_blocks_for_cols):
        X_sliding_window[local_row_idx, local_col_idx] = dtype(0)
        Y_sliding_window[local_row_idx, local_col_idx] = dtype(0)
        if (result_row_idx < X_n_rows) and (
            (local_col_idx + (square_block_side * block_idx)) < n_cols
        ):
            X_sliding_window[local_row_idx, local_col_idx] = X[
                result_row_idx, local_col_idx + (square_block_side * block_idx)
            ]

        if (result_col_idx < Y_n_cols) and (
            (local_row_idx + (square_block_side * block_idx)) < n_cols
        ):
            Y_sliding_window[local_row_idx, local_col_idx] = y[
                local_row_idx + (square_block_side * block_idx), result_col_idx
            ]

        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)

        for idx in range(square_block_side):
            output += (
                X_sliding_window[local_row_idx, idx]
                * Y_sliding_window[idx, local_col_idx]
            )

        dpex.barrier(dpex.CLK_LOCAL_MEM_FENCE)

    if (result_row_idx < X_n_rows) and (result_col_idx < Y_n_cols):
        result[result_row_idx, result_col_idx] = output


def _arange_reshaped(shape, dtype):
    n_items = shape[0] * shape[1]
    return np.arange(n_items, dtype=dtype).reshape(shape)


X = _arange_reshaped((5, 5), dtype)
Y = _arange_reshaped((5, 5), dtype)

print(np.matmul(X, Y))

X = dpt.asarray(X)
Y = dpt.asarray(Y)

device = X.device.sycl_device
result = dpt.zeros((5, 5), dtype, device=device)

matmul[(6,6), (2,2)](X, Y, result)

print(result)
```
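
As a sanity check (assuming the snippet above has just run, so `X`, `Y`, and `result` are still the `dpctl.tensor` arrays), one can compare the device result against NumPy on the host:

```python
# Sanity check: compare the device result with the host NumPy reference.
# Expected to fail with the miscompiled numba_dpex kernel above, and to
# pass for the DPC++ version below.
import numpy as np
import dpctl.tensor as dpt

np.testing.assert_allclose(
    dpt.asnumpy(result),
    np.matmul(dpt.asnumpy(X), dpt.asnumpy(Y)),
)
```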

Equivalent DPC++ snippet, provided by @oleksandr-pavlyk, that gives the expected output:

```cpp
#include <CL/sycl.hpp>
#include <vector>
#include <iostream>

template <typename T>
sycl::event
gemm(
  sycl::queue q,
  const T* X_ptr, const T* Y_ptr, T* R_ptr,
  size_t X_n_rows,
  size_t X_n_cols,
  size_t Y_n_cols,
  size_t gws0_blocks, size_t gws1_blocks, size_t lws0, size_t lws1,
  size_t square_block_size,
  const std::vector<sycl::event> &depends={}
)
{
  sycl::event comp_ev =
    q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends);

        sycl::local_accessor<T, 2> X_sliding_window(
             sycl::range<2>(lws0, lws1), cgh);
        sycl::local_accessor<T, 2> Y_sliding_window(
             sycl::range<2>(lws0, lws1), cgh);

        auto gwsRange = sycl::range<2>(
             gws0_blocks*lws0, gws1_blocks*lws1);
        auto lwsRange = sycl::range<2>(
             lws0, lws1);
        size_t n_cols = X_n_cols;
        cgh.parallel_for(
           sycl::nd_range<2>(gwsRange, lwsRange),
           [=](sycl::nd_item<2> ndit) {
             size_t result_row_idx = ndit.get_global_id(0);
             size_t result_col_idx = ndit.get_global_id(1);
             size_t local_row_idx = ndit.get_local_id(0);
             size_t local_col_idx = ndit.get_local_id(1);

             size_t n_blocks_for_cols = (n_cols + square_block_size - 1) / square_block_size;

             T output(0);
             for(size_t block_idx = 0; block_idx < n_blocks_for_cols; ++block_idx) {
               size_t col_idx = local_col_idx + square_block_size * block_idx;
               T X_v = (result_row_idx < X_n_rows && col_idx < n_cols) ? X_ptr[result_row_idx * X_n_cols + col_idx] : T(0);
               X_sliding_window[ndit.get_local_id()] = X_v;

               size_t row_idx = local_row_idx + square_block_size * block_idx;
               T Y_v = (result_col_idx < Y_n_cols && row_idx < n_cols) ? Y_ptr[row_idx * Y_n_cols + result_col_idx] : T(0);

               Y_sliding_window[ndit.get_local_id()] = Y_v;

               ndit.barrier(sycl::access::fence_space::local_space);

               for(size_t idx = 0; idx < square_block_size; ++idx) {
                 output += X_sliding_window[sycl::id<2>(local_row_idx, idx)] * Y_sliding_window[sycl::id<2>(idx, local_col_idx)];
               }

               ndit.barrier(sycl::access::fence_space::local_space);

             }
             if (result_row_idx < X_n_rows && result_col_idx < Y_n_cols) {
               R_ptr[result_row_idx * Y_n_cols + result_col_idx] = output;
             }
           }
        );
    });

  return comp_ev;
}

int main(void) {
  using T = float;
  size_t n = 5;

  size_t square_block_side = 2;

  sycl::queue q{ sycl::default_selector_v };

  size_t n_elems = n * n;
  T *X_ptr = sycl::malloc_device<T>(n_elems, q);
  T *Y_ptr = sycl::malloc_device<T>(n_elems, q);
  T *R_ptr = sycl::malloc_device<T>(n_elems, q);

  sycl::event pop_ev =
    q.submit([&](sycl::handler &cgh) {
       cgh.parallel_for({n_elems}, [=](sycl::id<1> id) { T v(id[0]); X_ptr[id] = v; Y_ptr[id] = v; R_ptr[id] = T(-7); });
    });

  size_t lws0 = 2;
  size_t lws1 = 2;
  size_t gws0_blocks = (n + lws0 - 1)/ lws0;
  size_t gws1_blocks = (n + lws1 - 1)/ lws1;

  sycl::event comp_ev =
    gemm(q, X_ptr, Y_ptr, R_ptr, n, n, n, gws0_blocks, gws1_blocks, lws0, lws1, lws0, {pop_ev});

  T* host_res = new T[n_elems];

  sycl::event copy_ev = q.copy<T>(R_ptr, host_res, n_elems, {comp_ev});

  copy_ev.wait();



  for(size_t i0 = 0; i0 < n; ++i0) {
    std::cout << "[ ";
    for(size_t i1 = 0; i1 < n; ++i1) {
      std::cout << host_res[ i0 * n + i1] << " ";
    }
    std::cout << "]" << std::endl;
  }

  delete[] host_res;

  sycl::free(X_ptr, q);
  sycl::free(Y_ptr, q);
  sycl::free(R_ptr, q);

  return 0;
}
```
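
For reference, the DPC++ snippet builds with the oneAPI DPC++ compiler, e.g. `icpx -fsycl matmul.cpp` (file name hypothetical), and prints the same product as the `np.matmul` reference above.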

Report of new issues introduced by refactoring in #804

I can tell that our KMeans code started to display new bugs (on GPU: huge slowdown and wrong output; on CPU: segfaults) after #804 was merged. It was reported in #886, but I just realized I misfiled that report when I suggested it came from #877 rather than #804 🤦. Unfortunately #888 does not fix the issues. I haven't investigated enough yet to file a more accurate report with a minimal reproducer; it's still on my todo list.

But I think the issues above should be cleared first: if the compiled code really doesn't match the Python code, everything else is going to be too hard to debug, and fixing that might also fix this.

@diptorupd @chudur-budur maybe investigating would be easier if we scheduled a live debugging session?
