@@ -1272,6 +1272,33 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
 #endif
     }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_0);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t  vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs, vs, vl);
+    }
 #else
     // scalar
     quantize_row_q8_0_reference(x, y, k);
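
For reference, here is a minimal scalar sketch of the per-block computation the RVV path above vectorizes. It reuses the surrounding variables (x, y, i, nb) and assumes QK8_0 == 32; note that vfncvt rounds according to the active rounding mode (round-to-nearest-even by default), whereas this sketch uses roundf:

    // one q8_0 block: find the absolute maximum, derive the scale d,
    // then quantize each element to a signed 8-bit value
    float amax = 0.0f;
    for (int j = 0; j < QK8_0; j++) {
        amax = fmaxf(amax, fabsf(x[i*QK8_0 + j]));
    }
    const float d  = amax / ((1 << 7) - 1);   // map [-amax, amax] onto [-127, 127]
    const float id = d ? 1.0f/d : 0.0f;
    y[i].d = GGML_FP32_TO_FP16(d);
    for (int j = 0; j < QK8_0; j++) {
        y[i].qs[j] = (int8_t) roundf(x[i*QK8_0 + j]*id);
    }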
@@ -1490,6 +1517,41 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
 #endif
     }
+#elif defined(__riscv_v_intrinsic)
+
+    size_t vl = __riscv_vsetvl_e32m4(QK8_1);
+
+    for (int i = 0; i < nb; i++) {
+        // load elements
+        vfloat32m4_t v_x   = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
+
+        vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+        vfloat32m1_t tmp   = __riscv_vfmv_v_f_f32m1(0.0, vl);
+        vfloat32m1_t vmax  = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+        float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+        const float d  = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+        // convert to integer
+        vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+        vint8m1_t  vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+        // store result
+        __riscv_vse8_v_i8m1(y[i].qs, vs, vl);
+
+        // compute sum for y[i].s
+        vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
+        vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
+
+        // set y[i].s
+        int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
+        y[i].s = sum*d;
+    }
 #else
     // scalar
    quantize_row_q8_1_reference(x, y, k);
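
The q8_1 path performs the same quantization but keeps d as a plain float and additionally stores the scaled block sum in y[i].s, which the vector code computes with a single widening reduction (vwredsum) over the freshly quantized int8 values. A scalar sketch of just that extra step, reusing d and the surrounding variables:

    // y[i].s = d * (sum of the 32 quantized values); consumers such as the
    // q4_1/q5_1 dot products use it to fold in the per-block offset m
    int sum = 0;
    for (int j = 0; j < QK8_1; j++) {
        sum += y[i].qs[j];
    }
    y[i].s = sum*d;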
@@ -2662,30 +2724,32 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     size_t vl = __riscv_vsetvl_e8m1(qk/2);

     for (int i = 0; i < nb; i++) {
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);

-        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

-        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);

-        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
-        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+        // subtract offset
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);

-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);

         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

         sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
     }
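
Besides moving to narrower register groups (u8mf2/i16m1 instead of u8m1/i16m2), the hunk above chains the reductions: the second vwredsum accumulates into vs1 rather than vec_zero, so a single vmv_x_s extracts the whole block sum. For orientation, a scalar sketch of what one block contributes (qk == 32; low nibbles pair with y[i].qs[0..15], high nibbles with y[i].qs[16..31], both offset by 8):

    int sumi = 0;
    for (int j = 0; j < qk/2; j++) {
        const int v0 = (x[i].qs[j] & 0x0F) - 8;  // low nibble
        const int v1 = (x[i].qs[j] >>   4) - 8;  // high nibble
        sumi += v0*y[i].qs[j] + v1*y[i].qs[j + qk/2];
    }
    sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);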
@@ -2823,27 +2887,28 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
     size_t vl = __riscv_vsetvl_e8m1(qk/2);

     for (int i = 0; i < nb; i++) {
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        // load elements
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);

-        vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        // mask and store lower part of x, and then upper part
+        vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

-        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);

-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);

         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

         sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
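
The q4_1 variant above follows the same pattern; the only differences are that the nibbles stay unsigned (no -8 offset, so vreinterpret alone is enough) and that the per-block minimum m enters through the precomputed sum y[i].s. A scalar sketch of one block:

    int sumi = 0;
    for (int j = 0; j < qk/2; j++) {
        const int v0 = x[i].qs[j] & 0x0F;
        const int v1 = x[i].qs[j] >>   4;
        sumi += v0*y[i].qs[j] + v1*y[i].qs[j + qk/2];
    }
    sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;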
@@ -3088,66 +3153,61 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
 
     uint32_t qh;
 
-    // These temp values are for masking and shift operations
-    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-    uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                           0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
-
     size_t vl = __riscv_vsetvl_e8m1(qk/2);
 
+    // These temporary registers are for masking and shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
+
+    vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
+    vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
     for (int i = 0; i < nb; i++) {
         memcpy(&qh, x[i].qh, sizeof(uint32_t));
 
-        // temporary registers
-        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
-        vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
-        vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
-        vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
-
         // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl);
-        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl);
-        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2, qh, vl);
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(xha_0, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
 
         // ((qh & (1u << (j + 16))) >> (j + 12));
-        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl);
-        vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl);
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(vt_3, qh, vl);
+        vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2(xha_1, vt_4, vl);
 
         // narrowing
-        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl);
-        vuint8m1_t  xh_0  = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xhl_0, vl);
+        vuint8mf2_t xh_0  = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
 
-        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl);
-        vuint8m1_t  xh_1  = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xhl_1, vl);
+        vuint8mf2_t xh_1  = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
 
         // load
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
 
-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
 
-        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
 
-        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
-        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
 
-        vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
 
-        vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl);
-        vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl);
+        vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 16, vl);
+        vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 16, vl);
 
-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
 
         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
 
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
 
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
         sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
     }
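
Here the lane-index and bit-mask vectors (vt_1 ... vt_4) are now built with vid.v plus shifts and hoisted out of the block loop, replacing the two stack arrays and their per-iteration loads. The per-lane arithmetic itself is unchanged; in scalar form, element j of a q5_0 block (0 <= j < qk/2) is reassembled and used as follows, matching the formulas quoted in the comments above:

    const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;  // 5th bit of the low nibble
    const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));       // 5th bit of the high nibble
    const int x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
    const int x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
    sumi += x0*y[i].qs[j] + x1*y[i].qs[j + qk/2];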
@@ -3414,62 +3474,58 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 
     uint32_t qh;
 
-    // These temp values are for shift operations
-    uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
     size_t vl = __riscv_vsetvl_e8m1(qk/2);
 
+    // temporary registers for shift operations
+    vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+    vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+
     for (int i = 0; i < nb; i++) {
         memcpy(&qh, x[i].qh, sizeof(uint32_t));
 
-        // temporary registers
-        vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
-        vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
-
         // load qh
-        vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl);
+        vuint32m2_t vqh = __riscv_vmv_v_x_u32m2(qh, vl);
 
         // ((qh >> (j + 0)) << 4) & 0x10;
-        vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl);
-        vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl);
-        vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl);
+        vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2(vqh, vt_1, vl);
+        vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2(xhr_0, 4, vl);
+        vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(xhl_0, 0x10, vl);
 
         // ((qh >> (j + 12)) ) & 0x10;
-        vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl);
-        vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl);
+        vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2(vqh, vt_2, vl);
+        vuint32m2_t xha_1 = __riscv_vand_vx_u32m2(xhr_1, 0x10, vl);
 
         // narrowing
-        vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl);
-        vuint8m1_t  xh_0  = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl);
+        vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1(xha_0, vl);
+        vuint8mf2_t xh_0  = __riscv_vncvt_x_x_w_u8mf2(xhc_0, vl);
 
-        vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl);
-        vuint8m1_t  xh_1  = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl);
+        vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1(xha_1, vl);
+        vuint8mf2_t xh_1  = __riscv_vncvt_x_x_w_u8mf2(xhc_1, vl);
 
         // load
-        vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+        vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);
 
-        vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl);
-        vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl);
+        vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl);
+        vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl);
 
-        vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
-        vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+        vuint8mf2_t x_at = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+        vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);
 
-        vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl);
-        vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl);
+        vuint8mf2_t x_a = __riscv_vor_vv_u8mf2(x_at, xh_0, vl);
+        vuint8mf2_t x_l = __riscv_vor_vv_u8mf2(x_lt, xh_1, vl);
 
-        vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a);
-        vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l);
+        vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a);
+        vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l);
 
-        vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl);
-        vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl);
+        vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl);
+        vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl);
 
         vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);
 
-        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl);
-        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl);
+        vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
+        vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);
 
-        int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
-        sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+        int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
 
         sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
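
The q5_1 hunk gets the same treatment (vid.v-based shift vector hoisted out of the loop, mf2/m1 register groups, chained reduction). In scalar form, element j of a q5_1 block differs only in how the 5th bit is isolated and in the absence of the -16 recentring, since the offset is carried by x[i].m and y[i].s:

    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
    const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
    const int x1 = (x[i].qs[j] >>   4) | xh_1;
    sumi += x0*y[i].qs[j] + x1*y[i].qs[j + qk/2];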