Skip to content

Commit b01e834

Browse files
committed
aarch64: Optimize SVE encode functions to use peak-performance vector combinations
Update both ec_encode_data_sve() and ec_encode_data_sve2() to use the optimal 4- and 5-vector combinations, based on benchmark results showing that these achieve the highest performance.

Key optimizations:
- Loop over 4-vector operations while rows > 7 (peak performance)
- Use a 4+3 combination for 7 vectors instead of a single 7-vector call
- Use a 4+2 combination for 6 vectors instead of a single 6-vector call
- Keep the 5-vector call for 5 vectors (second-best performance)
- Apply the same changes to both the SVE and SVE2 variants for consistent optimization

This leverages the benchmark finding that 4- and 5-vector operations achieve 40+ GB/s, significantly better than 6- and 7-vector operations, which drop to 30-36 GB/s.

Signed-off-by: Jonathan Swinney <[email protected]>
1 parent a00e9db commit b01e834

File tree

1 file changed

+26
-66
lines changed

1 file changed

+26
-66
lines changed

erasure_code/aarch64/ec_aarch64_highlevel_func.c

Lines changed: 26 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -214,47 +214,27 @@ ec_encode_data_sve(int len, int k, int rows, unsigned char *g_tbls, unsigned cha
214214
return;
215215
}
216216

217-
while (rows > 11) {
218-
gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
219-
g_tbls += 6 * k * 32;
220-
coding += 6;
221-
rows -= 6;
217+
while (rows > 7) {
218+
gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
219+
g_tbls += 4 * k * 32;
220+
coding += 4;
221+
rows -= 4;
222222
}
223223

224224
switch (rows) {
225-
case 11:
226-
/* 7 + 4 */
227-
gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
228-
g_tbls += 7 * k * 32;
229-
coding += 7;
230-
gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
231-
break;
232-
case 10:
233-
/* 6 + 4 */
234-
gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
235-
g_tbls += 6 * k * 32;
236-
coding += 6;
237-
gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
238-
break;
239-
case 9:
240-
/* 5 + 4 */
241-
gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
242-
g_tbls += 5 * k * 32;
243-
coding += 5;
244-
gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
245-
break;
246-
case 8:
247-
/* 4 + 4 */
225+
case 7:
226+
/* 4 + 3 */
248227
gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
249228
g_tbls += 4 * k * 32;
250229
coding += 4;
251-
gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
252-
break;
253-
case 7:
254-
gf_7vect_dot_prod_sve(len, k, g_tbls, data, coding);
230+
gf_3vect_dot_prod_sve(len, k, g_tbls, data, coding);
255231
break;
256232
case 6:
257-
gf_6vect_dot_prod_sve(len, k, g_tbls, data, coding);
233+
/* 4 + 2 */
234+
gf_4vect_dot_prod_sve(len, k, g_tbls, data, coding);
235+
g_tbls += 4 * k * 32;
236+
coding += 4;
237+
gf_2vect_dot_prod_sve(len, k, g_tbls, data, coding);
258238
break;
259239
case 5:
260240
gf_5vect_dot_prod_sve(len, k, g_tbls, data, coding);
@@ -285,47 +265,27 @@ ec_encode_data_sve2(int len, int k, int rows, unsigned char *g_tbls, unsigned ch
285265
return;
286266
}
287267

288-
while (rows > 11) {
289-
gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding);
290-
g_tbls += 6 * k * 32;
291-
coding += 6;
292-
rows -= 6;
268+
while (rows > 7) {
269+
gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding);
270+
g_tbls += 4 * k * 32;
271+
coding += 4;
272+
rows -= 4;
293273
}
294274

295275
switch (rows) {
296-
case 11:
297-
/* 7 + 4 */
298-
gf_7vect_dot_prod_sve2(len, k, g_tbls, data, coding);
299-
g_tbls += 7 * k * 32;
300-
coding += 7;
301-
gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding);
302-
break;
303-
case 10:
304-
/* 6 + 4 */
305-
gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding);
306-
g_tbls += 6 * k * 32;
307-
coding += 6;
308-
gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding);
309-
break;
310-
case 9:
311-
/* 5 + 4 */
312-
gf_5vect_dot_prod_sve2(len, k, g_tbls, data, coding);
313-
g_tbls += 5 * k * 32;
314-
coding += 5;
315-
gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding);
316-
break;
317-
case 8:
318-
/* 4 + 4 */
276+
case 7:
277+
/* 4 + 3 */
319278
gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding);
320279
g_tbls += 4 * k * 32;
321280
coding += 4;
322-
gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding);
323-
break;
324-
case 7:
325-
gf_7vect_dot_prod_sve2(len, k, g_tbls, data, coding);
281+
gf_3vect_dot_prod_sve2(len, k, g_tbls, data, coding);
326282
break;
327283
case 6:
328-
gf_6vect_dot_prod_sve2(len, k, g_tbls, data, coding);
284+
/* 4 + 2 */
285+
gf_4vect_dot_prod_sve2(len, k, g_tbls, data, coding);
286+
g_tbls += 4 * k * 32;
287+
coding += 4;
288+
gf_2vect_dot_prod_sve2(len, k, g_tbls, data, coding);
329289
break;
330290
case 5:
331291
gf_5vect_dot_prod_sve2(len, k, g_tbls, data, coding);

0 commit comments

Comments
 (0)