
Commit 19757e1

aarch64: Optimize vector rotates as vector permutes where possible
Some vector rotate operations can be implemented in a single instruction
rather than using the fallback SHL+USRA sequence.  In particular, when the
rotate amount is half the bitwidth of the element we can use a REV64, REV32
or REV16 instruction.  More generally, rotates by a byte amount can be
implemented using vector permutes.

This patch adds such a generic routine in expmed.cc, called
expand_rotate_as_vec_perm, that calculates the required permute indices and
uses the expand_vec_perm_const interface.  On aarch64 this ends up generating
the single-instruction sequences above where possible, and can use LDR+TBL
sequences too, which are a good choice.

With help from Richard, the routine should be VLA-safe.  However, the only
use of expand_rotate_as_vec_perm introduced in this patch is in
aarch64-specific code that for now only handles fixed-width modes.

A runtime aarch64 test is added to ensure the permute indices are not
messed up.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <[email protected]>

gcc/

	* expmed.h (expand_rotate_as_vec_perm): Declare.
	* expmed.cc (expand_rotate_as_vec_perm): Define.
	* config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
	Declare prototype.
	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	Call the above.

gcc/testsuite/

	* gcc.target/aarch64/vec-rot-exec.c: New test.
	* gcc.target/aarch64/simd/pr117048_2.c: New test.
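As a concrete illustration of the equivalence the patch relies on, here is a minimal standalone C sketch (not part of the commit; the lane values and names are invented for the example, and a little-endian host is assumed).  It checks that rotating each 32-bit lane of a 16-byte vector left by 8 bits matches a byte permute whose in-lane selectors follow the little-endian formula used by the patch, (nunits - rot_bytes + i) % nunits.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* Four 32-bit lanes of a 128-bit vector (example values).  */
  uint32_t lanes[4] = { 0x11223344u, 0x55667788u, 0x99aabbccu, 0xddeeff00u };
  uint8_t src[16], perm[16];
  memcpy (src, lanes, sizeof lanes);   /* assumes a little-endian host */

  unsigned nunits = 4;                        /* bytes per element */
  unsigned rot_bytes = 1;                     /* rotate left by 8 bits */
  unsigned rot_to_perm = nunits - rot_bytes;  /* little-endian case */

  /* Result byte i of each element selects source byte
     (rot_to_perm + i) % nunits of the same element.  */
  for (unsigned lane = 0; lane < 4; lane++)
    for (unsigned i = 0; i < nunits; i++)
      perm[lane * nunits + i]
	= src[lane * nunits + (rot_to_perm + i) % nunits];

  for (unsigned lane = 0; lane < 4; lane++)
    {
      uint32_t expected = (lanes[lane] << 8) | (lanes[lane] >> 24);
      uint32_t got;
      memcpy (&got, perm + lane * nunits, sizeof got);
      if (got != expected)
	{
	  printf ("lane %u: got %08x, expected %08x\n", lane, got, expected);
	  return 1;
	}
    }
  printf ("byte permute matches the 8-bit lane rotates\n");
  return 0;
}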

File tree: 7 files changed (+232, -0 lines)


gcc/config/aarch64/aarch64-protos.h

Lines changed: 1 addition & 0 deletions
@@ -851,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
 tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);

gcc/config/aarch64/aarch64-simd.md

Lines changed: 3 additions & 0 deletions
@@ -1313,6 +1313,9 @@
 	  (match_dup 4))
 	(match_dup 3)))]
   {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
     operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
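For context on what this pattern otherwise falls back to, here is a rough sketch (not from the commit) of the SHL+USRA idea written with NEON intrinsics for a rotate of 32-bit lanes by 8; the function name rot8_fallback is hypothetical.

#include <arm_neon.h>

/* Rotate each 32-bit lane left by 8: shift left, then use a
   shift-right-and-accumulate (USRA) to bring in the high bits.  The
   accumulate is an add, which equals an OR here because the two shifted
   parts do not overlap.  */
uint32x4_t
rot8_fallback (uint32x4_t x)
{
  uint32x4_t t = vshlq_n_u32 (x, 8);
  return vsraq_n_u32 (t, x, 24);
}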

gcc/config/aarch64/aarch64.cc

Lines changed: 16 additions & 0 deletions
@@ -16018,6 +16018,22 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
   return true;
 }
 
+/* Emit an optimized sequence to perform a vector rotate
+   of REG by the vector constant amount AMNT and place the result
+   in DST.  Return true iff successful.  */
+
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+  machine_mode mode = GET_MODE (reg);
+  /* Attempt to expand the rotate as a vector permute.
+     For some rotate amounts they can be single instructions and
+     even the general single-vector TBL permute has good throughput.  */
+  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  return false;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)

gcc/expmed.cc

Lines changed: 44 additions & 0 deletions
@@ -6286,6 +6286,50 @@ emit_store_flag_force (rtx target, enum rtx_code code, rtx op0, rtx op1,
   return target;
 }
 
+/* Expand a vector (left) rotate of MODE of X by an immediate AMT as a vector
+   permute operation.  Emit code to put the result in DST if successful and
+   return it.  Otherwise return NULL.  This is intended to implement vector
+   rotates by byte amounts using vector permutes when the target does not
+   offer native vector rotate operations.  */
+rtx
+expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
+{
+  rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
+  /* For now handle only rotate by the same integer constant in all lanes.
+     In principle rotates by any constant vector are representable through
+     permutes as long as the individual rotate amounts are multiples of
+     BITS_PER_UNIT.  */
+  if (!CONST_INT_P (amt_unwrap))
+    return NULL_RTX;
+
+  int rotamnt = INTVAL (amt_unwrap);
+  if (rotamnt % BITS_PER_UNIT != 0)
+    return NULL_RTX;
+  machine_mode qimode;
+  if (!qimode_for_vec_perm (mode).exists (&qimode))
+    return NULL_RTX;
+
+  vec_perm_builder builder;
+  unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
+  poly_uint64 total_units = GET_MODE_SIZE (mode);
+  builder.new_vector (total_units, nunits, 3);
+  unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
+  unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
+  for (unsigned j = 0; j < 3 * nunits; j += nunits)
+    for (unsigned i = 0; i < nunits; i++)
+      builder.quick_push ((rot_to_perm + i) % nunits + j);
+
+  rtx perm_src = lowpart_subreg (qimode, x, mode);
+  rtx perm_dst = lowpart_subreg (qimode, dst, mode);
+  rtx res
+    = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
+			     qimode, perm_dst);
+  if (!res)
+    return NULL_RTX;
+  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  return dst;
+}
+
 /* Helper function for canonicalize_cmp_for_target.  Swap between inclusive
    and exclusive ranges in order to create an equivalent comparison.  See
    canonicalize_cmp_for_target for the possible cases.  */
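To make the index encoding above concrete, here is a small host-side sketch (not part of the commit) that replays the selector loop for a little-endian V4SImode rotate by 16 bits: nunits is 4 bytes per element, rot_bytes is 2, so rot_to_perm is 2.  The builder itself only receives the first 3 * nunits selectors and stores them as nunits stepped patterns that extrapolate to the rest of the (possibly variable-length) vector; the sketch simply prints the full 16-byte pattern for clarity.

#include <stdio.h>

int
main (void)
{
  /* V4SImode rotated by 16 bits on little-endian:
     4-byte elements, 16 bytes total.  */
  unsigned nunits = 4;                        /* bytes per element */
  unsigned total_units = 16;                  /* bytes in the vector */
  unsigned rot_bytes = 16 / 8;                /* rotamnt / BITS_PER_UNIT */
  unsigned rot_to_perm = nunits - rot_bytes;  /* little-endian case */

  for (unsigned j = 0; j < total_units; j += nunits)
    for (unsigned i = 0; i < nunits; i++)
      printf ("%u ", (rot_to_perm + i) % nunits + j);
  printf ("\n");
  /* Prints: 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13
     i.e. swap the two 16-bit halves of every 32-bit element, which the
     aarch64 backend matches as a single rev32 (see G2 in pr117048_2.c).  */
  return 0;
}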

gcc/expmed.h

Lines changed: 1 addition & 0 deletions
@@ -726,5 +726,6 @@ extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
 					rtx, int);
 extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
 				       int, int);
+extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
 
 #endif // EXPMED_H
gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c

Lines changed: 66 additions & 0 deletions (new file)

/* { dg-do compile } */
/* { dg-options "-O2 -mlittle-endian" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

/*
** G1:
**	rev64	v0\.4s, v0\.4s
**	ret
*/
v2di
G1 (v2di r)
{
  return (r >> 32) | (r << 32);
}

/*
** G2:
**	rev32	v0\.8h, v0\.8h
**	ret
*/
v4si
G2 (v4si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G3:
**	rev16	v0\.16b, v0\.16b
**	ret
*/
v8hi
G3 (v8hi r)
{
  return (r >> 8) | (r << 8);
}

/*
** G4:
**	rev32	v0\.4h, v0\.4h
**	ret
*/
v2si
G4 (v2si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G5:
**	rev16	v0\.8b, v0\.8b
**	ret
*/
v4hi
G5 (v4hi r)
{
  return (r >> 8) | (r << 8);
}
gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c

Lines changed: 101 additions & 0 deletions (new file)

/* { dg-do run } */
/* { dg-options "-O2" } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef char __attribute__ ((vector_size (8))) v8qi;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;
#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))

static const char __attribute__ ((aligned (16))) *str = "abcdefghijklmnopqrstuvwxyz";

unsigned long long
__attribute__((noipa,noinline))
rot_64_one (unsigned long long x, unsigned amt)
{
  return (x << amt) | (x >> (64 - amt));
}
unsigned int
__attribute__((noipa,noinline))
rot_32_one (unsigned int x, unsigned amt)
{
  return (x << amt) | (x >> (32 - amt));
}

unsigned short
__attribute__((noipa,noinline))
rot_16_one (unsigned short x, unsigned short amt)
{
  return (x << amt) | (x >> (16 - amt));
}


#define ROTFUNC(M,S,A)					\
M							\
__attribute__((noipa,noinline))				\
rot_##M##_##S##_##A (M x)				\
{							\
  return (x << A) | (x >> (S - A));			\
}							\
							\
void							\
test_rot_##M##_##S##_##A (void)				\
{							\
  M vec = *(M *)str;					\
  M res = rot_##M##_##S##_##A (vec);			\
  for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++)	\
    if (res[i] != rot_##S##_one (vec[i], A))		\
      __builtin_abort ();				\
}

ROTFUNC (v2di, 64, 56)
ROTFUNC (v2di, 64, 48)
ROTFUNC (v2di, 64, 40)
ROTFUNC (v2di, 64, 32)
ROTFUNC (v2di, 64, 24)
ROTFUNC (v2di, 64, 16)
ROTFUNC (v2di, 64, 8)

ROTFUNC (v4si, 32, 24)
ROTFUNC (v4si, 32, 16)
ROTFUNC (v4si, 32, 8)

ROTFUNC (v8hi, 16, 8)

ROTFUNC (v2si, 32, 24)
ROTFUNC (v2si, 32, 16)
ROTFUNC (v2si, 32, 8)

ROTFUNC (v4hi, 16, 8)

#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()

int
main (void)
{
  CALL_TEST (v2di, 64, 56);
  CALL_TEST (v2di, 64, 48);
  CALL_TEST (v2di, 64, 40);
  CALL_TEST (v2di, 64, 32);
  CALL_TEST (v2di, 64, 24);
  CALL_TEST (v2di, 64, 16);
  CALL_TEST (v2di, 64, 8);

  CALL_TEST (v4si, 32, 24);
  CALL_TEST (v4si, 32, 16);
  CALL_TEST (v4si, 32, 8);

  CALL_TEST (v8hi, 16, 8);

  CALL_TEST (v2si, 32, 24);
  CALL_TEST (v2si, 32, 16);
  CALL_TEST (v2si, 32, 8);

  CALL_TEST (v4hi, 16, 8);

  return 0;
}
