@@ -1272,6 +1272,33 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
+ #elif defined(__riscv_v_intrinsic)
+
+ size_t vl = __riscv_vsetvl_e32m4(QK8_0);
+
+ for (int i = 0; i < nb; i++) {
+ // load elements
+ vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl);
+
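+ // find amax = max(|x[j]|) over the block: take |x| lane-wise, then fold it with a max-reduction seeded by 0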
+ vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl);
+ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ y[i].d = GGML_FP32_TO_FP16(d);
+
+ vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+ // convert to integer
+ vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+ vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
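+ // (each narrowing convert halves the element width, hence the two-step f32 -> i16 -> i8 chain)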
+
+ // store result
+ __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+ }
#else
// scalar
quantize_row_q8_0_reference(x, y, k);
@@ -1490,6 +1517,41 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
#endif
}
+ #elif defined(__riscv_v_intrinsic)
+
+ size_t vl = __riscv_vsetvl_e32m4(QK8_1);
+
+ for (int i = 0; i < nb; i++) {
+ // load elements
+ vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_1, vl);
+
+ vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl);
+ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0, vl);
+ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl);
+ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax);
+
+ const float d = amax / ((1 << 7) - 1);
+ const float id = d ? 1.0f/d : 0.0f;
+
+ y[i].d = d;
+
+ vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
+
+ // convert to integer
+ vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl);
+ vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl);
+
+ // store result
+ __riscv_vse8_v_i8m1(y[i].qs , vs, vl);
+
+ // compute sum for y[i].s
+ vint16m1_t tmp2 = __riscv_vmv_v_x_i16m1(0, vl);
+ vint16m1_t vwrs = __riscv_vwredsum_vs_i8m1_i16m1(vs, tmp2, vl);
+
+ // set y[i].s
+ int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
+ y[i].s = sum*d;
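+ // note: vwredsum folds the int8 quants into a single i16 sum; scaling it by d makes y[i].s (approximately) the sum of the original floats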
+ }
#else
// scalar
quantize_row_q8_1_reference(x, y, k);
@@ -2662,30 +2724,32 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
size_t vl = __riscv_vsetvl_e8m1(qk/2);

for (int i = 0; i < nb; i++) {
- vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+ // load elements
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

- vint8m1_t y0 = __riscv_vle8_v_i8m1 (y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1 (y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2 (y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2 (y[i].qs+16, vl);

- vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
- vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+ // extract the lower nibbles of x, then the upper nibbles
+ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

- vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1 (x_a);
- vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1 (x_l);
+ vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_a);
+ vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_l);

- vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl);
- vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl);
+ // subtract offset
+ vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl);
+ vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl);

- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2 (v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2 (v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1 (v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1 (v1, y1, vl);

vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul2, vec_zero , vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul2, vs1 , vl);
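+ // the second reduction is seeded with vs1, so vs2 already holds the combined sum of both nibble halves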
- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
@@ -2823,27 +2887,28 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
size_t vl = __riscv_vsetvl_e8m1(qk/2);

for (int i = 0; i < nb; i++) {
- vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl);
+ // load elements
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl);

- vint8m1_t y0 = __riscv_vle8_v_i8m1 (y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1 (y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2 (y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2 (y[i].qs+16, vl);

- vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl);
- vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl);
+ // extract the lower nibbles of x, then the upper nibbles
+ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl);
+ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl);

- vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1 (x_a);
- vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1 (x_l);
+ vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_a);
+ vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_l);

- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2 (v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2 (v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1 (v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1 (v1, y1, vl);
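+ // note: with the half-width (mf2) int8 operands, the widening multiplies produce their i16 results at LMUL=1 rather than LMUL=2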
vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul2, vec_zero , vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul2, vs1 , vl);

- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}
@@ -3088,66 +3153,61 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
uint32_t qh;

- // These temp values are for masking and shift operations
- uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
- uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
- 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
-
size_t vl = __riscv_vsetvl_e8m1(qk/2);

+ // These temporary registers are for masking and shift operations
+ vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+ vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
+
+ vuint32m2_t vt_3 = __riscv_vsll_vx_u32m2(vt_2, 16, vl);
+ vuint32m2_t vt_4 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
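+ // vt_1 = lane index j, vt_2 = 1u << j, vt_3 = 1u << (j + 16), vt_4 = j + 12, built in registers instead of being loaded from the old temp_1/temp_2 tables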
+

for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));

- // temporary registers
- vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl);
- vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl);
- vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl);
- vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl);
-
// ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
- vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1 , qh, vl);
- vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4 (xha_0, vt_2 , vl);
- vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4 (xhr_0, 4, vl);
+ vuint32m2_t xha_0 = __riscv_vand_vx_u32m2(vt_2 , qh, vl);
+ vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2 (xha_0, vt_1 , vl);
+ vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2 (xhr_0, 4, vl);

// ((qh & (1u << (j + 16))) >> (j + 12));
- vuint32m4_t xha_1 = __riscv_vand_vx_u32m4 (vt_3, qh, vl);
- vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4 (xha_1, vt_4, vl);
+ vuint32m2_t xha_1 = __riscv_vand_vx_u32m2 (vt_3, qh, vl);
+ vuint32m2_t xhl_1 = __riscv_vsrl_vv_u32m2 (xha_1, vt_4, vl);

// narrowing
- vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2 (xhl_0, vl);
- vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1 (xhc_0, vl);
+ vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1 (xhl_0, vl);
+ vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2 (xhc_0, vl);

- vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2 (xhl_1, vl);
- vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1 (xhc_1, vl);
+ vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1 (xhl_1, vl);
+ vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2 (xhc_1, vl);

// load
- vuint8m1_t tx = __riscv_vle8_v_u8m1 (x[i].qs, vl);
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2 (x[i].qs, vl);

- vint8m1_t y0 = __riscv_vle8_v_i8m1 (y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1 (y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2 (y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2 (y[i].qs+16, vl);

- vuint8m1_t x_at = __riscv_vand_vx_u8m1 (tx, 0x0F, vl);
- vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1 (tx, 0x04, vl);
+ vuint8mf2_t x_at = __riscv_vand_vx_u8mf2 (tx, 0x0F, vl);
+ vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2 (tx, 0x04, vl);

- vuint8m1_t x_a = __riscv_vor_vv_u8m1 (x_at, xh_0, vl);
- vuint8m1_t x_l = __riscv_vor_vv_u8m1 (x_lt, xh_1, vl);
+ vuint8mf2_t x_a = __riscv_vor_vv_u8mf2 (x_at, xh_0, vl);
+ vuint8mf2_t x_l = __riscv_vor_vv_u8mf2 (x_lt, xh_1, vl);

- vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1 (x_a);
- vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1 (x_l);
+ vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_a);
+ vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_l);

- vint8m1_t v0 = __riscv_vsub_vx_i8m1 (x_ai, 16, vl);
- vint8m1_t v1 = __riscv_vsub_vx_i8m1 (x_li, 16, vl);
+ vint8mf2_t v0 = __riscv_vsub_vx_i8mf2 (x_ai, 16, vl);
+ vint8mf2_t v1 = __riscv_vsub_vx_i8mf2 (x_li, 16, vl);

- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2 (v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2 (v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1 (v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1 (v1, y1, vl);

vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul2, vec_zero , vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul2, vs1 , vl);

- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi;
}
@@ -3414,62 +3474,58 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
uint32_t qh;

- // These temp values are for shift operations
- uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
-
size_t vl = __riscv_vsetvl_e8m1(qk/2);

+ // temporary registers for shift operations
+ vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
+ vuint32m2_t vt_2 = __riscv_vadd_vx_u32m2(vt_1, 12, vl);
+

for (int i = 0; i < nb; i++) {
memcpy(&qh, x[i].qh, sizeof(uint32_t));

- // temporary registers
- vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl);
- vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl);
-
// load qh
- vuint32m4_t vqh = __riscv_vmv_v_x_u32m4 (qh, vl);
+ vuint32m2_t vqh = __riscv_vmv_v_x_u32m2 (qh, vl);

// ((qh >> (j + 0)) << 4) & 0x10;
- vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4 (vqh, vt_1, vl);
- vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4 (xhr_0, 4, vl);
- vuint32m4_t xha_0 = __riscv_vand_vx_u32m4 (xhl_0, 0x10, vl);
+ vuint32m2_t xhr_0 = __riscv_vsrl_vv_u32m2 (vqh, vt_1, vl);
+ vuint32m2_t xhl_0 = __riscv_vsll_vx_u32m2 (xhr_0, 4, vl);
+ vuint32m2_t xha_0 = __riscv_vand_vx_u32m2 (xhl_0, 0x10, vl);

// ((qh >> (j + 12)) ) & 0x10;
- vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4 (vqh, vt_2, vl);
- vuint32m4_t xha_1 = __riscv_vand_vx_u32m4 (xhr_1, 0x10, vl);
+ vuint32m2_t xhr_1 = __riscv_vsrl_vv_u32m2 (vqh, vt_2, vl);
+ vuint32m2_t xha_1 = __riscv_vand_vx_u32m2 (xhr_1, 0x10, vl);

// narrowing
- vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2 (xha_0, vl);
- vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1 (xhc_0, vl);
+ vuint16m1_t xhc_0 = __riscv_vncvt_x_x_w_u16m1 (xha_0, vl);
+ vuint8mf2_t xh_0 = __riscv_vncvt_x_x_w_u8mf2 (xhc_0, vl);

- vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2 (xha_1, vl);
- vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1 (xhc_1, vl);
+ vuint16m1_t xhc_1 = __riscv_vncvt_x_x_w_u16m1 (xha_1, vl);
+ vuint8mf2_t xh_1 = __riscv_vncvt_x_x_w_u8mf2 (xhc_1, vl);
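+ // each vncvt halves the element width, so two steps take the 32-bit high-bit values down to bytes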
// load
- vuint8m1_t tx = __riscv_vle8_v_u8m1 (x[i].qs, vl);
+ vuint8mf2_t tx = __riscv_vle8_v_u8mf2 (x[i].qs, vl);

- vint8m1_t y0 = __riscv_vle8_v_i8m1 (y[i].qs, vl);
- vint8m1_t y1 = __riscv_vle8_v_i8m1 (y[i].qs+16, vl);
+ vint8mf2_t y0 = __riscv_vle8_v_i8mf2 (y[i].qs, vl);
+ vint8mf2_t y1 = __riscv_vle8_v_i8mf2 (y[i].qs+16, vl);

- vuint8m1_t x_at = __riscv_vand_vx_u8m1 (tx, 0x0F, vl);
- vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1 (tx, 0x04, vl);
+ vuint8mf2_t x_at = __riscv_vand_vx_u8mf2 (tx, 0x0F, vl);
+ vuint8mf2_t x_lt = __riscv_vsrl_vx_u8mf2 (tx, 0x04, vl);

- vuint8m1_t x_a = __riscv_vor_vv_u8m1 (x_at, xh_0, vl);
- vuint8m1_t x_l = __riscv_vor_vv_u8m1 (x_lt, xh_1, vl);
+ vuint8mf2_t x_a = __riscv_vor_vv_u8mf2 (x_at, xh_0, vl);
+ vuint8mf2_t x_l = __riscv_vor_vv_u8mf2 (x_lt, xh_1, vl);

- vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1 (x_a);
- vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1 (x_l);
+ vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_a);
+ vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2 (x_l);

- vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2 (v0, y0, vl);
- vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2 (v1, y1, vl);
+ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1 (v0, y0, vl);
+ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1 (v1, y1, vl);

vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl);

- vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul1, vec_zero, vl);
- vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1 (vec_mul2, vec_zero , vl);
+ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul1, vec_zero, vl);
+ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1 (vec_mul2, vs1 , vl);

- int sumi = __riscv_vmv_x_s_i32m1_i32(vs1);
- sumi += __riscv_vmv_x_s_i32m1_i32(vs2);
+ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
}