ggml-cpu: extend support for RVV floating-point kernels #17318
Changes from all commits: 28fcd3e, 96128a9, 1bda8fb, e07238d, 3fb0501, addf578, 2786a97, e5c8adb
```diff
@@ -3305,13 +3305,33 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
         __m128 y_vec = _mm_cvtph_ps(x_vec);
         _mm_storeu_ps(y + i, y_vec);
     }
-#elif defined(__riscv_zvfh)
-    for (int vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m1(n - i);
-        vfloat16m1_t vx = __riscv_vle16_v_f16m1((_Float16 *)&x[i], vl);
-        vfloat32m2_t vy = __riscv_vfwcvt_f_f_v_f32m2(vx, vl);
-        __riscv_vse32_v_f32m2(&y[i], vy, vl);
-    }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvt_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvt_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
 #endif

     for (; i < n; ++i) {
```

Review thread on the leftover loop:

Collaborator: We can eliminate the separate leftover loop by configuring the vector length directly within the main loop. This simplifies the code and enables the CPU implementation to distribute tail elements more evenly. There are some examples in …

Reply: Assuming we are keeping the unroll by 2 (see https://github.com/ggml-org/llama.cpp/pull/17318/files#r2564828448), this leftover loop lets the remaining elements be handled in a vectorized manner. There is some redundancy between this loop and the scalar one, but the compiler is smart enough to remove the scalar loop.
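As a rough illustration of the reviewer's suggestion (a sketch only, not code from this PR), a single strip-mined loop can renegotiate the vector length on every iteration and thereby absorb the tail, at the cost of dropping the explicit unroll by 2. The function name is hypothetical:

```c
#include <riscv_vector.h>

#if defined(__riscv_v_intrinsic) && defined(__riscv_zvfhmin)
// Tail-free strip-mining for fp16 -> fp32 conversion: __riscv_vsetvl_e16m2
// clamps vl to the remaining element count, so the last iteration simply
// processes fewer elements and no separate leftover loop is needed.
static void fp16_to_fp32_strip_mined(const _Float16 * x, float * y, int64_t n) {
    for (int64_t i = 0, vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m2(n - i);
        vfloat16m2_t ax = __riscv_vle16_v_f16m2(x + i, vl);
        vfloat32m4_t ay = __riscv_vfwcvt_f_f_v_f32m4(ax, vl);
        __riscv_vse32_v_f32m4(y + i, ay, vl);
    }
}
#endif
```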
```diff
@@ -3356,6 +3376,31 @@ void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
                                 (const __m128i *)(x + i))),
                                 16)));
     }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfmin)
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m2();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, epr);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, epr);
+        __riscv_vse32_v_f32m4(y + i, ay0, epr);
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16*)x + i + epr, epr);
+        vfloat32m4_t ay1 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax1, epr);
+        __riscv_vse32_v_f32m4(y + i + epr, ay1, epr);
+    }
+
+    // leftovers
+    int vl;
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16*)x + i, vl);
+        vfloat32m4_t ay0 = __riscv_vfwcvtbf16_f_f_v_f32m4(ax0, vl);
+        __riscv_vse32_v_f32m4(y + i, ay0, vl);
+    }
 #endif
     for (; i < n; i++) {
         y[i] = GGML_BF16_TO_FP32(x[i]);
```

Comment on lines +3385 to +3403:

Collaborator: Same as above.

Reply: https://docs.google.com/presentation/d/1Vrb4qt8YBt0pbiOA4-z2XcIcZIbLwizJa7-s5DclGpo/edit?slide=id.g39983ae8256_0_47#slide=id.g39983ae8256_0_47
```diff
@@ -195,8 +195,48 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
     sumf += (ggml_float)_mm_cvtss_f32(g);

 #undef LOAD
 #endif
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfbfwma)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (; i < np; i += step) {
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], epr);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], epr);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vbfloat16m2_t ax1 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i + epr], epr);
+        vbfloat16m2_t ay1 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i + epr], epr);
+        vsum1 = __riscv_vfwmaccbf16_vv_f32m4(vsum1, ax1, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // accumulate in 1 register
+    vsum0 = __riscv_vfadd_vv_f32m4(vsum0, vsum1, vl);
+
+    // leftovers
+    for (i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vbfloat16m2_t ax0 = __riscv_vle16_v_bf16m2((const __bf16 *)&x[i], vl);
+        vbfloat16m2_t ay0 = __riscv_vle16_v_bf16m2((const __bf16 *)&y[i], vl);
+        vsum0 = __riscv_vfwmaccbf16_vv_f32m4(vsum0, ax0, ay0, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m4();
+    vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
 #endif
     for (; i < n; ++i) {
         sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
                              GGML_BF16_TO_FP32(y[i]));
```

Comment on lines +202 to +203:

Collaborator: Consider increasing LMUL for unrolling to prevent code duplication.

Reply: Similarly to …

Collaborator (on the `__asm__ __volatile__ ("" ::: "memory");` statement): Why?
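To illustrate the LMUL suggestion above (a sketch only, not code from this PR): the two manually unrolled m2 register groups could be folded into a single m4 group, letting LMUL supply the extra work per iteration instead of duplicated statements. The free-standing function name and signature are illustrative; the `_tu` (tail-undisturbed) form of the accumulate is assumed so that untouched accumulator lanes stay intact when the final iteration shortens `vl`:

```c
#include <riscv_vector.h>

// Hypothetical bf16 dot product using LMUL=4 loads widened into an f32m8
// accumulator, with no manual unroll and no separate leftover loop.
static float vec_dot_bf16_lmul4(int n, const __bf16 * x, const __bf16 * y) {
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    vfloat32m8_t vsum = __riscv_vfmv_v_f_f32m8(0.0f, vlmax);

    for (int i = 0, vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m4(n - i);
        vbfloat16m4_t ax = __riscv_vle16_v_bf16m4(x + i, vl);
        vbfloat16m4_t ay = __riscv_vle16_v_bf16m4(y + i, vl);
        // tail-undisturbed: lanes >= vl keep their previous partial sums
        vsum = __riscv_vfwmaccbf16_vv_f32m8_tu(vsum, ax, ay, vl);
    }

    // single reduction over the whole register group at the end
    vfloat32m1_t red = __riscv_vfredusum_vs_f32m8_f32m1(vsum, __riscv_vfmv_v_f_f32m1(0.0f, 1), vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(red);
}
```

The trade-off is register pressure: m8 intermediates leave fewer register groups for software pipelining, which is part of what the PR's benchmarks weighed against the unroll-by-2 form.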
```diff
@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
     GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
-        }
-    }
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2 along the row dimension
+    for (int i = 0; i < np; i += step) {
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+    }
+
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+    // leftovers
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+    }
+
+    // reduce
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+    vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+    vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+    sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
```

Collaborator (on the accumulator setup): The same suggestions from …
```diff
@@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     }
     np = n;
 #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
-    const int np = n;
-    _Float16 hv = (_Float16)v;
-    for (int i = 0, avl; i < n; i += avl) {
-        avl = __riscv_vsetvl_e16m8(n - i);
-        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
-        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
-        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
-        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
-    }
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16*)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16*)x + i, vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
+    }
+    np = n;
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
```
```diff
@@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
         svst1_f16(pg, (__fp16 *)(y + np), out);
     }
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-    for (int i = 0, vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
-        vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
-        vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
-        vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
-        __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
-    }
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16*)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16*)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16*)y + i + epr, ay1, epr);
+        __asm__ __volatile__ ("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16*)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16*)y + i, ay0, vl);
+    }
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
```

Review comments:

Collaborator: I admire the commitment to remove the unnecessary float32 proxy and use float16 directly, but using RVV merely to emulate fixed-length SIMD seems like a missed opportunity for elegance. It would be delightful to see an implementation that actually leverages the hardware's native agility.

Reply: I'm not sure I understand what you mean here? Would you rather have the …

Comment (on the unrolled loop): https://docs.google.com/presentation/d/1Vrb4qt8YBt0pbiOA4-z2XcIcZIbLwizJa7-s5DclGpo/edit?slide=id.g39983ae8256_0_35#slide=id.g39983ae8256_0_35 for the numbers and why the choice of …
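One possible reading of the "native agility" remark is a fully vector-length-agnostic loop with no fixed step, no unroll and no separate tail. A minimal sketch (not code from this PR; the function name, signature and LMUL=8 choice are placeholders):

```c
#include <riscv_vector.h>

// Hypothetical vector-length-agnostic scale: vl is renegotiated every
// iteration, so the same loop body handles full vectors and the tail alike.
static void vec_scale_f16_vla(int n, _Float16 * y, _Float16 scale) {
    for (int i = 0, vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m8(n - i);
        vfloat16m8_t ay = __riscv_vle16_v_f16m8(y + i, vl);
        ay = __riscv_vfmul_vf_f16m8(ay, scale, vl);
        __riscv_vse16_v_f16m8(y + i, ay, vl);
    }
}
```

The PR's benchmarks, per the links above, favored m4 with an explicit unroll by 2 over this simpler shape on the tested hardware.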
Comment: Is there a reason not to use f16m4 -> f32m8 directly, rather than manual unrolling?

Reply: unroll by 2 is what yielded the best results: https://docs.google.com/presentation/d/1Vrb4qt8YBt0pbiOA4-z2XcIcZIbLwizJa7-s5DclGpo/edit?slide=id.g39983ae8256_0_47#slide=id.g39983ae8256_0_47
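For reference, a sketch of the alternative raised in the question, applied to the fp16 -> fp32 conversion: a single e16m4 load widened straight into f32m8, so the larger register group does the "unrolling" instead of two explicit m2 statements. The function name is illustrative and this is not the variant the PR adopted:

```c
#include <riscv_vector.h>

// Hypothetical single-pass widening conversion: one f16m4 load per iteration
// produces an f32m8 result; strip-mining handles the tail.
static void fp16_to_fp32_m4_to_m8(const _Float16 * x, float * y, int64_t n) {
    for (int64_t i = 0, vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m4(n - i);
        vfloat16m4_t ax = __riscv_vle16_v_f16m4(x + i, vl);
        vfloat32m8_t ay = __riscv_vfwcvt_f_f_v_f32m8(ax, vl);
        __riscv_vse32_v_f32m8(y + i, ay, vl);
    }
}
```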
Comment: riseproject-dev#1

Decisions around LMUL and unrolling are a result of the benchmarking numbers summarized in the above PR. We benchmarked various LMUL and unrolling configurations, as well as variants that prevent the compiler from rearranging any load accesses. These permutations were tested for both cache-hot and cache-cold numbers, with cache-hot numbers prioritized.
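The "preventing the compiler from rearranging any load accesses" part appears to be what the empty inline-asm statements in the diffs above accomplish: an empty `__asm__ __volatile__` with a `"memory"` clobber is a compiler-only barrier, so the loads and stores of the second unrolled half are not scheduled across the first. A minimal, scalar illustration of the idea (the macro name and function are made up for this sketch):

```c
// Compiler-only scheduling barrier: emits no instructions, but tells the
// compiler that memory may have changed, so it will not move memory
// accesses across this point.
#define COMPILER_BARRIER() __asm__ __volatile__ ("" ::: "memory")

// Toy unroll by 2; the barrier keeps the two halves' loads and stores
// from being interleaved by the compiler's instruction scheduler.
void axpy_unrolled(int n, float a, const float * x, float * y) {
    for (int i = 0; i + 1 < n; i += 2) {
        y[i] += a * x[i];
        COMPILER_BARRIER();
        y[i + 1] += a * x[i + 1];
        COMPILER_BARRIER();
    }
    if (n & 1) {
        y[n - 1] += a * x[n - 1];
    }
}
```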