
Commit 19757e1

aarch64: Optimize vector rotates as vector permutes where possible
Some vector rotate operations can be implemented in a single instruction
rather than using the fallback SHL+USRA sequence.  In particular, when the
rotate amount is half the bitwidth of the element we can use a REV64, REV32
or REV16 instruction.  More generally, rotates by a byte amount can be
implemented using vector permutes.

This patch adds such a generic routine in expmed.cc, called
expand_rotate_as_vec_perm, that calculates the required permute indices and
uses the expand_vec_perm_const interface.  On aarch64 this ends up generating
the single-instruction sequences above where possible, and can use LDR+TBL
sequences too, which are a good choice.

With help from Richard, the routine should be VLA-safe.  However, the only
use of expand_rotate_as_vec_perm introduced in this patch is in
aarch64-specific code that for now only handles fixed-width modes.

A runtime aarch64 test is added to ensure the permute indices are not
messed up.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <[email protected]>

gcc/

	* expmed.h (expand_rotate_as_vec_perm): Declare.
	* expmed.cc (expand_rotate_as_vec_perm): Define.
	* config/aarch64/aarch64-protos.h (aarch64_emit_opt_vec_rotate):
	Declare prototype.
	* config/aarch64/aarch64.cc (aarch64_emit_opt_vec_rotate): Implement.
	* config/aarch64/aarch64-simd.md (*aarch64_simd_rotate_imm<mode>):
	Call the above.

gcc/testsuite/

	* gcc.target/aarch64/vec-rot-exec.c: New test.
	* gcc.target/aarch64/simd/pr117048_2.c: New test.
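As a concrete illustration of the equivalence the patch relies on, here is a minimal standalone C sketch (not part of the commit; the lane values and names are invented for the example, and a little-endian host is assumed).  It checks that rotating each 32-bit lane of a 16-byte vector left by 8 bits matches a byte permute whose in-lane selectors follow the little-endian formula used by the patch, (nunits - rot_bytes + i) % nunits.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  /* Four 32-bit lanes of a 128-bit vector (example values).  */
  uint32_t lanes[4] = { 0x11223344u, 0x55667788u, 0x99aabbccu, 0xddeeff00u };
  uint8_t src[16], perm[16];
  memcpy (src, lanes, sizeof lanes);   /* assumes a little-endian host */

  unsigned nunits = 4;                        /* bytes per element */
  unsigned rot_bytes = 1;                     /* rotate left by 8 bits */
  unsigned rot_to_perm = nunits - rot_bytes;  /* little-endian case */

  /* Result byte i of each element selects source byte
     (rot_to_perm + i) % nunits of the same element.  */
  for (unsigned lane = 0; lane < 4; lane++)
    for (unsigned i = 0; i < nunits; i++)
      perm[lane * nunits + i]
	= src[lane * nunits + (rot_to_perm + i) % nunits];

  for (unsigned lane = 0; lane < 4; lane++)
    {
      uint32_t expected = (lanes[lane] << 8) | (lanes[lane] >> 24);
      uint32_t got;
      memcpy (&got, perm + lane * nunits, sizeof got);
      if (got != expected)
	{
	  printf ("lane %u: got %08x, expected %08x\n", lane, got, expected);
	  return 1;
	}
    }
  printf ("byte permute matches the 8-bit lane rotates\n");
  return 0;
}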

File tree: 7 files changed (+232, -0 lines)


gcc/config/aarch64/aarch64-protos.h

Lines changed: 1 addition & 0 deletions
@@ -851,6 +851,7 @@ bool aarch64_rnd_imm_p (rtx);
 bool aarch64_constant_address_p (rtx);
 bool aarch64_emit_approx_div (rtx, rtx, rtx);
 bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
+bool aarch64_emit_opt_vec_rotate (rtx, rtx, rtx);
 tree aarch64_vector_load_decl (tree);
 rtx aarch64_gen_callee_cookie (aarch64_isa_mode, arm_pcs);
 void aarch64_expand_call (rtx, rtx, rtx, bool);

gcc/config/aarch64/aarch64-simd.md

Lines changed: 3 additions & 0 deletions
@@ -1313,6 +1313,9 @@
 	  (match_dup 4))
 	(match_dup 3)))]
   {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
     operands[3] = reload_completed ? operands[0] : gen_reg_rtx (<MODE>mode);
     rtx shft_amnt = unwrap_const_vec_duplicate (operands[2]);
     int bitwidth = GET_MODE_UNIT_SIZE (<MODE>mode) * BITS_PER_UNIT;
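For context on what this pattern otherwise falls back to, here is a rough sketch (not from the commit) of the SHL+USRA idea written with NEON intrinsics for a rotate of 32-bit lanes by 8; the function name rot8_fallback is hypothetical.

#include <arm_neon.h>

/* Rotate each 32-bit lane left by 8: shift left, then use a
   shift-right-and-accumulate (USRA) to bring in the high bits.  The
   accumulate is an add, which equals an OR here because the two shifted
   parts do not overlap.  */
uint32x4_t
rot8_fallback (uint32x4_t x)
{
  uint32x4_t t = vshlq_n_u32 (x, 8);
  return vsraq_n_u32 (t, x, 24);
}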

gcc/config/aarch64/aarch64.cc

Lines changed: 16 additions & 0 deletions
@@ -16018,6 +16018,22 @@ aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
   return true;
 }
 
+/* Emit an optimized sequence to perform a vector rotate
+   of REG by the vector constant amount AMNT and place the result
+   in DST.  Return true iff successful.  */
+
+bool
+aarch64_emit_opt_vec_rotate (rtx dst, rtx reg, rtx amnt)
+{
+  machine_mode mode = GET_MODE (reg);
+  /* Attempt to expand the rotate as a vector permute.
+     For some rotate amounts they can be single instructions and
+     even the general single-vector TBL permute has good throughput.  */
+  if (expand_rotate_as_vec_perm (mode, dst, reg, amnt))
+    return true;
+  return false;
+}
+
 /* Return the number of instructions that can be issued per cycle.  */
 static int
 aarch64_sched_issue_rate (void)

gcc/expmed.cc

Lines changed: 44 additions & 0 deletions
@@ -6286,6 +6286,50 @@ emit_store_flag_force (rtx target, enum rtx_code code, rtx op0, rtx op1,
   return target;
 }
 
+/* Expand a vector (left) rotate of MODE of X by an immediate AMT as a vector
+   permute operation.  Emit code to put the result in DST if successful and
+   return it.  Otherwise return NULL.  This is intended to implement vector
+   rotates by byte amounts using vector permutes when the target does not
+   offer native vector rotate operations.  */
+rtx
+expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
+{
+  rtx amt_unwrap = unwrap_const_vec_duplicate (amt);
+  /* For now handle only rotate by the same integer constant in all lanes.
+     In principle rotates by any constant vector are representable through
+     permutes as long as the individual rotate amounts are multiples of
+     BITS_PER_UNIT.  */
+  if (!CONST_INT_P (amt_unwrap))
+    return NULL_RTX;
+
+  int rotamnt = INTVAL (amt_unwrap);
+  if (rotamnt % BITS_PER_UNIT != 0)
+    return NULL_RTX;
+  machine_mode qimode;
+  if (!qimode_for_vec_perm (mode).exists (&qimode))
+    return NULL_RTX;
+
+  vec_perm_builder builder;
+  unsigned nunits = GET_MODE_SIZE (GET_MODE_INNER (mode));
+  poly_uint64 total_units = GET_MODE_SIZE (mode);
+  builder.new_vector (total_units, nunits, 3);
+  unsigned rot_bytes = rotamnt / BITS_PER_UNIT;
+  unsigned rot_to_perm = BYTES_BIG_ENDIAN ? rot_bytes : nunits - rot_bytes;
+  for (unsigned j = 0; j < 3 * nunits; j += nunits)
+    for (unsigned i = 0; i < nunits; i++)
+      builder.quick_push ((rot_to_perm + i) % nunits + j);
+
+  rtx perm_src = lowpart_subreg (qimode, x, mode);
+  rtx perm_dst = lowpart_subreg (qimode, dst, mode);
+  rtx res
+    = expand_vec_perm_const (qimode, perm_src, perm_src, builder,
+			     qimode, perm_dst);
+  if (!res)
+    return NULL_RTX;
+  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  return dst;
+}
+
 /* Helper function for canonicalize_cmp_for_target.  Swap between inclusive
    and exclusive ranges in order to create an equivalent comparison.  See
    canonicalize_cmp_for_target for the possible cases.  */
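To make the index encoding above concrete, here is a small host-side sketch (not part of the commit) that replays the selector loop for a little-endian V4SImode rotate by 16 bits: nunits is 4 bytes per element, rot_bytes is 2, so rot_to_perm is 2.  The builder itself only receives the first 3 * nunits selectors and stores them as nunits stepped patterns that extrapolate to the rest of the (possibly variable-length) vector; the sketch simply prints the full 16-byte pattern for clarity.

#include <stdio.h>

int
main (void)
{
  /* V4SImode rotated by 16 bits on little-endian:
     4-byte elements, 16 bytes total.  */
  unsigned nunits = 4;                        /* bytes per element */
  unsigned total_units = 16;                  /* bytes in the vector */
  unsigned rot_bytes = 16 / 8;                /* rotamnt / BITS_PER_UNIT */
  unsigned rot_to_perm = nunits - rot_bytes;  /* little-endian case */

  for (unsigned j = 0; j < total_units; j += nunits)
    for (unsigned i = 0; i < nunits; i++)
      printf ("%u ", (rot_to_perm + i) % nunits + j);
  printf ("\n");
  /* Prints: 2 3 0 1 6 7 4 5 10 11 8 9 14 15 12 13
     i.e. swap the two 16-bit halves of every 32-bit element, which the
     aarch64 backend matches as a single rev32 (see G2 in pr117048_2.c).  */
  return 0;
}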

gcc/expmed.h

Lines changed: 1 addition & 0 deletions
@@ -726,5 +726,6 @@ extern rtx expand_mult_highpart_adjust (scalar_int_mode, rtx, rtx, rtx,
 					rtx, int);
 extern rtx expmed_mult_highpart_optab (scalar_int_mode, rtx, rtx, rtx,
 				       int, int);
+extern rtx expand_rotate_as_vec_perm (machine_mode, rtx, rtx, rtx);
 
 #endif // EXPMED_H
gcc/testsuite/gcc.target/aarch64/simd/pr117048_2.c

Lines changed: 66 additions & 0 deletions (new file)

/* { dg-do compile } */
/* { dg-options "-O2 -mlittle-endian" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;

/*
** G1:
**	rev64	v0\.4s, v0\.4s
**	ret
*/
v2di
G1 (v2di r)
{
  return (r >> 32) | (r << 32);
}

/*
** G2:
**	rev32	v0\.8h, v0\.8h
**	ret
*/
v4si
G2 (v4si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G3:
**	rev16	v0\.16b, v0\.16b
**	ret
*/
v8hi
G3 (v8hi r)
{
  return (r >> 8) | (r << 8);
}

/*
** G4:
**	rev32	v0\.4h, v0\.4h
**	ret
*/
v2si
G4 (v2si r)
{
  return (r >> 16) | (r << 16);
}

/*
** G5:
**	rev16	v0\.8b, v0\.8b
**	ret
*/
v4hi
G5 (v4hi r)
{
  return (r >> 8) | (r << 8);
}
gcc/testsuite/gcc.target/aarch64/vec-rot-exec.c

Lines changed: 101 additions & 0 deletions (new file)

/* { dg-do run } */
/* { dg-options "-O2" } */

typedef char __attribute__ ((vector_size (16))) v16qi;
typedef unsigned short __attribute__ ((vector_size (16))) v8hi;
typedef unsigned int __attribute__ ((vector_size (16))) v4si;
typedef unsigned long long __attribute__ ((vector_size (16))) v2di;
typedef char __attribute__ ((vector_size (8))) v8qi;
typedef unsigned short __attribute__ ((vector_size (8))) v4hi;
typedef unsigned int __attribute__ ((vector_size (8))) v2si;
#define VEC_ELTS(X) (sizeof (X) / (sizeof (X[0])))

static const char __attribute__ ((aligned (16))) *str = "abcdefghijklmnopqrstuvwxyz";

unsigned long long
__attribute__((noipa,noinline))
rot_64_one (unsigned long long x, unsigned amt)
{
  return (x << amt) | (x >> (64 - amt));
}
unsigned int
__attribute__((noipa,noinline))
rot_32_one (unsigned int x, unsigned amt)
{
  return (x << amt) | (x >> (32 - amt));
}

unsigned short
__attribute__((noipa,noinline))
rot_16_one (unsigned short x, unsigned short amt)
{
  return (x << amt) | (x >> (16 - amt));
}


#define ROTFUNC(M,S,A)					\
M							\
__attribute__((noipa,noinline))				\
rot_##M##_##S##_##A (M x)				\
{							\
  return (x << A) | (x >> (S - A));			\
}							\
							\
void							\
test_rot_##M##_##S##_##A (void)				\
{							\
  M vec = *(M *)str;					\
  M res = rot_##M##_##S##_##A (vec);			\
  for (__SIZE_TYPE__ i = 0; i < VEC_ELTS (vec); i++)	\
    if (res[i] != rot_##S##_one (vec[i], A))		\
      __builtin_abort ();				\
}

ROTFUNC (v2di, 64, 56)
ROTFUNC (v2di, 64, 48)
ROTFUNC (v2di, 64, 40)
ROTFUNC (v2di, 64, 32)
ROTFUNC (v2di, 64, 24)
ROTFUNC (v2di, 64, 16)
ROTFUNC (v2di, 64, 8)

ROTFUNC (v4si, 32, 24)
ROTFUNC (v4si, 32, 16)
ROTFUNC (v4si, 32, 8)

ROTFUNC (v8hi, 16, 8)

ROTFUNC (v2si, 32, 24)
ROTFUNC (v2si, 32, 16)
ROTFUNC (v2si, 32, 8)

ROTFUNC (v4hi, 16, 8)

#define CALL_TEST(M,S,A) test_rot_##M##_##S##_##A ()

int
main (void)
{
  CALL_TEST (v2di, 64, 56);
  CALL_TEST (v2di, 64, 48);
  CALL_TEST (v2di, 64, 40);
  CALL_TEST (v2di, 64, 32);
  CALL_TEST (v2di, 64, 24);
  CALL_TEST (v2di, 64, 16);
  CALL_TEST (v2di, 64, 8);

  CALL_TEST (v4si, 32, 24);
  CALL_TEST (v4si, 32, 16);
  CALL_TEST (v4si, 32, 8);

  CALL_TEST (v8hi, 16, 8);

  CALL_TEST (v2si, 32, 24);
  CALL_TEST (v2si, 32, 16);
  CALL_TEST (v2si, 32, 8);

  CALL_TEST (v4hi, 16, 8);

  return 0;
}
