diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 789ba97bfba39..c2fb20c17e910 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -4436,6 +4436,24 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
 #endif
 }
 
+inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return true;
+        default:
+            return false;
+    }
+}
+
 template<typename dst_t>
 static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy,
                                      const sycl::nd_item<3> &item_ct1,
@@ -4581,6 +4599,36 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
 }
 
+template<typename dst_t>
+static void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                   const sycl::nd_item<3> &item_ct1,
+                                   const uint64_t *iq2s_grid,
+                                   const uint8_t *ksigns_iq2xs,
+                                   const uint8_t *kmask_iq2xs) {
+    const int i = item_ct1.get_group(2);
+    const block_iq2_s * x = (const block_iq2_s *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int il = tid/8; // 0...3
+    const int ib = tid%8; // 0...7
+    dst_t * y = yy + i*QK_K + 32*ib + 8*il;
+    const uint8_t * qs = x[i].qs + 8*ib;
+    const uint8_t * grid1 = (const uint8_t *)(iq2s_grid + qs[2*il+0]);
+    const uint8_t * grid2 = (const uint8_t *)(iq2s_grid + qs[2*il+1]);
+    const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
+    const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
+    for (int j = 0; j < 4; ++j) {
+        y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
+        y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
+    }
+#else
+    assert(false);
+#endif
+
+}
+
+
 /*
 DPCT1110:4: The total declared local variable size in device function
 dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
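For reviewers: `dequantize_block_iq2_s` above walks the IQ2_S block layout. A copy of the struct is shown below for convenience; it is reproduced from memory of the upstream ggml-common.h header, so re-check the field sizes against the revision this PR targets.

```cpp
#include <cstdint>

// IQ2_S block layout (assumed, copied from ggml-common.h for reference).
// With QK_K = 256 a block is 82 bytes for 256 weights, ~2.5625 bits/weight.
#define QK_K 256
typedef uint16_t ggml_half;  // ggml stores fp16 as a 16-bit value

typedef struct {
    ggml_half d;                  // fp16 super-block scale
    uint8_t   qs[QK_K / 4];       // 32 grid-index bytes, then 32 sign bytes
    uint8_t   qh[QK_K / 32];      // 2 extra high bits per grid index
    uint8_t   scales[QK_K / 32];  // two 4-bit sub-block scales per byte
} block_iq2_s;

static_assert(sizeof(block_iq2_s) == 82, "2 + 64 + 8 + 8 bytes");
```

Each 4-bit scale nibble `s` decodes as `(float)d * (0.5f + s) * 0.25f`, which is the `d` expression computed in the kernel; the qh/ksigns indexing follows the existing IQ2_XS kernel in this file.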
@@ -7497,6 +7545,57 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
 #endif
 }
 
+static __dpct_inline__ float
+vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
+                   const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                   const uint64_t *iq2s_grid, const uint64_t *ksigns64) {
+#if QK_K == 256
+    const block_iq2_s * bq2 = (const block_iq2_s *) vbq;
+
+    const int ib32 = iqs;
+    const int8_t * q8 = bq8_1[ib32].qs;
+    const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32;
+    const uint8_t ls1 = bq2->scales[ib32] & 0xf;
+    const uint8_t ls2 = bq2->scales[ib32] >> 4;
+    int sumi1 = 0;
+    for (int l = 0; l < 2; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs0, signs0, std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs1, signs1, std::minus<>());
+        sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
+        sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
+        q8 += 8;
+    }
+    int sumi2 = 0;
+    for (int l = 2; l < 4; ++l) {
+        const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300)));
+        const uint32_t signs0 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const uint32_t signs1 = dpct::vectorized_binary<sycl::uchar4>(
+            ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>());
+        const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
+            grid[0] ^ signs0, signs0, std::minus<>());
+        const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
+            grid[1] ^ signs1, signs1, std::minus<>());
+        sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
+        sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
+        q8 += 8;
+    }
+    const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
+    return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
+#else
+    (void) ksigns64;
+    assert(false);
+    return 0.f;
+#endif
+}
+
 template <int qk, int qi, typename block_q_t, int vdr,
           vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
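The `vec_dot_iq2_s_q8_1` kernel above leans on a packed-sign trick that is easy to miss inside the dpct intrinsics. Here is a standalone scalar model of it (hypothetical helper name, for illustration only, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the sign expansion in vec_dot_iq2_s_q8_1. A sign nibble is
// broadcast into all four bytes of a word by multiplying with 0x01010101; the
// mask 0x08040201 then leaves exactly one of the four sign bits per byte, and
// a per-byte equality test against the same mask turns "bit set" into 0xFF.
static uint32_t expand_sign_nibble(uint8_t nibble) {
    const uint32_t picked = (nibble * 0x01010101u) & 0x08040201u;
    uint32_t mask = 0;
    for (int b = 0; b < 4; ++b) {
        const uint32_t lane = (picked >> (8 * b)) & 0xffu;
        const uint32_t ref  = (0x08040201u >> (8 * b)) & 0xffu;
        mask |= (lane == ref ? 0xffu : 0x00u) << (8 * b);
    }
    return mask;
}

int main() {
    // nibble 0b0101: sign bits set for bytes 0 and 2
    assert(expand_sign_nibble(0x5) == 0x00ff00ffu);
    return 0;
}
```

With that mask, the `(grid ^ signs) - signs` pairs computed via `dpct::vectorized_binary<sycl::uchar4>` perform a per-byte conditional negation: where the mask byte is 0xFF, `x ^ 0xFF = ~x` and `~x + 1 = -x`, so the grid bytes are sign-applied before the `dp4a` accumulation against the q8 values.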
@@ -8353,6 +8452,53 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
     }
 }
 
+
+template <int qk, int qi, typename block_q_t, int vdr>
+static void mul_mat_vec_q_iq2_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
+                                     const sycl::nd_item<3> &item_ct1,
+                                     const uint64_t *iq2s_grid_ptr, const uint64_t *ksigns64_ptr) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+         i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
+
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs, iq2s_grid_ptr, ksigns64_ptr);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
                                    const sycl::nd_item<3> &item_ct1) {
@@ -10096,6 +10242,36 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
     }
 }
 
+template <typename dst_t>
+static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k,
+                                      dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        iq2s_grid.init(*stream);
+        ksigns_iq2xs.init(*stream);
+        kmask_iq2xs.init(*stream);
+
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->submit([&](sycl::handler &cgh) {
+            auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr();
+            auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
+            auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
+
+            cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_iq2_s(
+                                     vx, y, item_ct1, iq2s_grid_ptr_ct1,
+                                     ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1);
+                             });
+        });
+    }
+}
+
+
 template <typename dst_t>
 static void convert_unary_sycl(const void *__restrict__ vx,
                                dst_t *__restrict__ y, const int k,
@@ -10150,6 +10326,8 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try {
             return dequantize_row_iq3_s_sycl;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
         case GGML_TYPE_F32:
             return convert_unary_sycl<float>;
         default:
@@ -10194,6 +10372,8 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
            return dequantize_row_iq3_s_sycl;
         case GGML_TYPE_IQ1_S:
             return dequantize_row_iq1_s_sycl;
+        case GGML_TYPE_IQ2_S:
+            return dequantize_row_iq2_s_sycl;
         case GGML_TYPE_F16:
             return convert_unary_sycl<sycl::half>;
         default:
@@ -10839,6 +11019,35 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
     }
 }
 
+static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy,
+                                        float *dst, const int ncols,
+                                        const int nrows,
+                                        dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+    {
+        iq2s_grid.init(*stream);
+        ksigns64.init(*stream);
+
+        stream->submit([&](sycl::handler &cgh) {
+            auto iq2s_grid_ptr_ct1 = iq2s_grid.get_ptr();
+            auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        mul_mat_vec_q_iq2_s_q8_1<QK_K, QI2_S, block_iq2_s, 1>(
+                            vx, vy, dst, ncols, nrows, item_ct1,
+                            iq2s_grid_ptr_ct1, ksigns64_ptr_ct1);
+                    });
+        });
+    }
+}
+
+
 static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
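The launcher above gives each 32-wide sub-group one output row (`block_dims` is `(1, GGML_SYCL_MMV_Y, WARP_SIZE)`), and the xor-shuffle loop at the end of `mul_mat_vec_q_iq2_s_q8_1` is a butterfly reduction over those 32 lanes. A host-side simulation of that reduction (illustrative only, not part of the patch):

```cpp
#include <array>
#include <cassert>

// Simulates the sub-group butterfly reduction at the end of
// mul_mat_vec_q_iq2_s_q8_1: after log2(32) = 5 xor steps every lane holds the
// full row sum, so lane 0 can store dst[row] directly.
int main() {
    std::array<float, 32> lane{};
    float expected = 0.0f;
    for (int i = 0; i < 32; ++i) { lane[i] = float(i + 1); expected += lane[i]; }

    for (int mask = 16; mask > 0; mask >>= 1) {
        std::array<float, 32> shuffled = lane;  // snapshot, like a shuffle read
        for (int i = 0; i < 32; ++i) {
            // models dpct::permute_sub_group_by_xor(sg, tmp, mask)
            lane[i] = shuffled[i] + shuffled[i ^ mask];
        }
    }

    for (int i = 0; i < 32; ++i) { assert(lane[i] == expected); }
    return 0;
}
```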
@@ -13612,6 +13821,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYCL_MAX_DEVICES> & tensor_split) {
     case GGML_TYPE_IQ2_XXS:
     case GGML_TYPE_IQ2_XS:
     case GGML_TYPE_IQ1_S:
+    case GGML_TYPE_IQ2_S:
         return max_compute_capability >= VER_GEN9 ? 128 : 64;
     case GGML_TYPE_IQ3_S:
         return max_compute_capability >= VER_GEN9 ? 128 : 64;
@@ -13631,7 +13841,8 @@ inline void ggml_sycl_op_mul_mat_vec_q(
     const int64_t src1_ncols, const int64_t src1_padded_row_size,
     const dpct::queue_ptr &stream) {
 
-    GGML_ASSERT(ggml_nrows(src1) == 1);
+    //GGML_ASSERT(ggml_nrows(src1) == 1);
+    //GGML_ASSERT(ne10 % QK8_1 == 0);
 
     const int64_t ne00 = src0->ne[0];
     const int64_t row_diff = row_high - row_low;
@@ -13682,6 +13893,9 @@ inline void ggml_sycl_op_mul_mat_vec_q(
         case GGML_TYPE_IQ1_S:
            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -13758,6 +13972,24 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
         case GGML_TYPE_Q6_K:
             dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
+        case GGML_TYPE_IQ2_XXS:
+            mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ2_XS:
+            mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ3_XXS:
+            mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ3_S:
+            mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ1_S:
+            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_IQ2_S:
+            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
         case GGML_TYPE_F16:
             convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
@@ -15177,7 +15409,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_SYCL_FORCE_DMMV
         const bool use_mul_mat_vec_q = false;
 #else
-        const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+        const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
 #endif // GGML_SYCL_FORCE_DMMV
 
         if (use_mul_mat_vec_q) {
@@ -15189,7 +15421,8 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
             ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
         }
     } else {
-        bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
+        bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type)
+            && ggml_sycl_supports_mmq(src0->type);
 
         if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
             use_mul_mat_q = false;
@@ -17031,9 +17264,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
                 return false;
             }
             ggml_type a_type = a->type;
-            if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S ||
-                a_type == GGML_TYPE_IQ4_XS) {
+            // No support in mmvq or other methods
+            if (a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS)
                 return false;
+
+            // Support in mmvq
+            if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
+                a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ3_S || a_type == GGML_TYPE_IQ2_S) {
+                // condition for using mmvq
+                if (b->ne[1] > 1 || a->ne[0] % GGML_SYCL_DMMV_X != 0) {
+                    return false;
+                }
             }
             return true;
         } break;
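Taken together, the dispatch hunks drop the single-row restriction on the mmvq path, gate the MMQ path on `ggml_sycl_supports_mmq`, and make `supports_op` the place where the IQ-specific limits are enforced. That last gate can be restated as a small predicate (hypothetical helper that only mirrors the logic above; `kDmmvX` stands in for the build-time constant GGML_SYCL_DMMV_X):

```cpp
#include <cstdint>

// Hypothetical restatement of the supports_op gating, for review only.
// kind classifies a->type; n_batch is b->ne[1]; n_cols is a->ne[0].
enum class IQKind {
    NotIQ,        // non-IQ types: handled by the regular quant paths
    MmvqOnly,     // IQ2_XXS/XS/S, IQ3_XXS/S, IQ1_S: only mmvq kernels exist
    Unsupported,  // IQ1_M, IQ4_NL, IQ4_XS: no SYCL kernel at all
};

constexpr int64_t kDmmvX = 32;  // stand-in for GGML_SYCL_DMMV_X

bool sycl_mul_mat_supported(IQKind kind, int64_t n_batch, int64_t n_cols) {
    switch (kind) {
        case IQKind::Unsupported:
            return false;
        case IQKind::MmvqOnly:
            // rejected when src1 is batched or the row size is misaligned
            return n_batch <= 1 && n_cols % kDmmvX == 0;
        case IQKind::NotIQ:
            return true;
    }
    return true;  // unreachable; keeps -Wreturn-type quiet
}
```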