
Commit dd13a1b

Added RVV intrinsics support for the Q8 quantize row functions and improved the existing dot product functions for RISC-V.

RVV intrinsics are added for the following quantize row functions:
quantize_row_q8_0
quantize_row_q8_1

The following dot product functions have also been optimized by using LMUL = 1/2 instead of LMUL = 1:
ggml_vec_dot_q4_0_q8_0
ggml_vec_dot_q4_1_q8_1
ggml_vec_dot_q5_0_q8_0
ggml_vec_dot_q5_1_q8_1

In the Q5 functions, vector initialization from a temporary array is also replaced by the vid intrinsic.

Signed-off-by: Ahmad Tameem <[email protected]>
1 parent f5ef5cf commit dd13a1b
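For context, the vid replacement mentioned in the message builds the Q5 shift indices and bit masks directly in vector registers instead of loading them from temporary stack arrays. A minimal standalone sketch of that pattern (the helper name q5_index_and_masks is illustrative, not part of the patch; it assumes a RISC-V toolchain exposing <riscv_vector.h>):

#include <riscv_vector.h>

// Sketch only: vid generates the lane indices {0, 1, 2, ...}, and shifting a
// splatted 1 by those indices yields the per-lane bit masks {1<<0, 1<<1, ...}
// that the Q5_0 path previously loaded from a 16-element array.
static inline void q5_index_and_masks(size_t vl, vuint32m2_t *idx, vuint32m2_t *mask) {
    *idx  = __riscv_vid_v_u32m2(vl);                                       // 0, 1, 2, ...
    *mask = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), *idx, vl); // 1 << j
}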

1 file changed (+153, -97 lines)

ggml.c

Lines changed: 153 additions & 97 deletions
@@ -1272,6 +1272,33 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
 #endif
     }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+        vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+    }
 #else
     // scalar
     quantize_row_q8_0_reference(x, y, k);
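For readers new to the intrinsics, the block above vectorizes the same per-block arithmetic as the scalar reference path it falls back to: find the absolute maximum, derive the scale, then round each value to int8. A rough scalar sketch of one block (illustrative names, rounding-mode details set aside):

#include <math.h>
#include <stdint.h>

// Illustrative scalar equivalent of one QK8_0-sized block of the RVV code above.
static void q8_0_block_scalar(const float *x, int n, float *d_out, int8_t *q) {
    float amax = 0.0f;
    for (int j = 0; j < n; j++) {
        amax = fmaxf(amax, fabsf(x[j]));     // vfabs + vfredmax
    }
    const float d  = amax / ((1 << 7) - 1);  // scale so amax maps to 127
    const float id = d ? 1.0f/d : 0.0f;
    for (int j = 0; j < n; j++) {
        q[j] = (int8_t)roundf(x[j]*id);      // vfmul + vfncvt/vncvt narrowing
    }
    *d_out = d;
}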
@@ -1490,6 +1517,41 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
 #endif
     }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl);
+        vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+
+        // compute sum for y[i].s
+        vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
+        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
+
+        // set y[i].s
+        int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
+        y[i].s = sum*d;
+    }
 #else
     // scalar
     quantize_row_q8_1_reference(x, y, k);
@@ -2662,30 +2724,32 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     size_t vl = __riscv_vsetvl_e8m1(qk/2);
 
     for (int i = 0; i < nb; i++) {
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
 
-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
 
-        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
 
-        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
 
-        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
-        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+        // subtract offset
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);
 
-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
 
         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
 
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
 
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
         sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
     }
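Besides dropping to LMUL = 1/2, the updated loop chains the two widening reductions: vs1 is fed to the second vwredsum as its scalar accumulator, so both halves land in vs2 and only one vmv.x.s is needed per block. In scalar terms, the value extracted into sumi is (illustrative helper, not from ggml.c):

#include <stdint.h>

// Scalar view of the per-block integer sum produced by the chained reductions
// for Q4_0 x Q8_0; half is qk/2 (16), matching the y[i].qs+16 offset above.
static int q4_0_block_sumi(const uint8_t *xqs, const int8_t *yqs, int half) {
    int sumi = 0;
    for (int j = 0; j < half; j++) {
        const int v0 = (xqs[j] & 0x0F) - 8;    // lower nibble minus offset
        const int v1 = (xqs[j] >>   4) - 8;    // upper nibble minus offset
        sumi += v0*yqs[j] + v1*yqs[j + half];  // both partial sums folded together
    }
    return sumi;
}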
@@ -2823,27 +2887,28 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     size_t vl = __riscv_vsetvl_e8m1(qk/2);
 
     for (int i = 0; i < nb; i++) {
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
 
-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
 
-        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
 
-        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
 
-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
 
         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
 
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
 
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
         sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
@@ -3088,66 +3153,61 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 
     uint32_t qh;
 
-    // These temp values are for masking and shift operations
-    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-    uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                           0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
-
     size_t vl = __riscv_vsetvl_e8m1(qk/2);
 
+    // These temporary registers are for masking and shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
+
+    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
+    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
     for (int i = 0; i < nb; i++) {
         memcpy(&qh, x[i].qh, sizeof(uint32_t));
 
-        // temporary registers
-        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
-        vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
-        vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
-        vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
-
         // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
-        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
-        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
 
         // ((qh & (1u << (j + 16))) >> (j + 12));
-        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
-        vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
+        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
 
         // narrowing
-        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
-        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
+        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
 
-        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
-        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
+        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
 
         // load
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
 
-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
 
-        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
 
-        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
-        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
 
-        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
 
-        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
-        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
 
-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
 
         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
 
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
 
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
         sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
     }
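The comments in the hunk above quote the scalar bit manipulation being vectorized; putting them together, each 5-bit value is rebuilt from a packed nibble plus one bit of qh before the 16 offset is removed. A scalar sketch (illustrative helper name, not from ggml.c):

#include <stdint.h>

// Scalar view of the Q5_0 reconstruction: bits 0..3 come from the nibble,
// bit 4 comes from the packed qh word, then the 16 offset is subtracted.
static void q5_0_unpack(uint32_t qh, const uint8_t *xqs, int half, int8_t *v0, int8_t *v1) {
    for (int j = 0; j < half; j++) {
        const uint8_t xh_0 = ((qh & (1u << (j +  0))) >> (j +  0)) << 4;
        const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
        v0[j] = (int8_t)(((xqs[j] & 0x0F) | xh_0) - 16);
        v1[j] = (int8_t)(((xqs[j] >>   4) | xh_1) - 16);
    }
}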
@@ -3414,62 +3474,58 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 
     uint32_t qh;
 
-    // These temp values are for shift operations
-    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
     size_t vl = __riscv_vsetvl_e8m1(qk/2);
 
+    // temporary registers for shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
     for (int i = 0; i < nb; i++) {
         memcpy(&qh, x[i].qh, sizeof(uint32_t));
 
-        // temporary registers
-        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
-        vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
-
         // load qh
-        vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
 
         // ((qh >> (j + 0)) << 4) & 0x10;
-        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
-        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
-        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
 
         // ((qh >> (j + 12)) ) & 0x10;
-        vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
-        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
 
         // narrowing
-        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
-        vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
+        vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
 
-        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
-        vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
+        vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
 
         // load
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
 
-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
 
-        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
 
-        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
-        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
 
-        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
 
-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
 
         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
 
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
 
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
         sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
