Skip to content

Commit d7250b6

Browse files
author
Christophe Lyon
committed
arm: [MVE intrinsics] rework vddup vidup
Implement vddup and vidup using the new MVE builtins framework. We generate better code because we take advantage of the two outputs produced by the v[id]dup instructions. For instance, before: ldr r3, [r0] sub r2, r3, #8 str r2, [r0] mov r2, r3 vddup.u16 q3, r2, #1 now: ldr r2, [r0] vddup.u16 q3, r2, #1 str r2, [r0] 2024-08-21 Christophe Lyon <[email protected]> gcc/ * config/arm/arm-mve-builtins-base.cc (class viddup_impl): New. (vddup): New. (vidup): New. * config/arm/arm-mve-builtins-base.def (vddupq): New. (vidupq): New. * config/arm/arm-mve-builtins-base.h (vddupq): New. (vidupq): New. * config/arm/arm_mve.h (vddupq_m): Delete. (vddupq_u8): Delete. (vddupq_u32): Delete. (vddupq_u16): Delete. (vidupq_m): Delete. (vidupq_u8): Delete. (vidupq_u32): Delete. (vidupq_u16): Delete. (vddupq_x_u8): Delete. (vddupq_x_u16): Delete. (vddupq_x_u32): Delete. (vidupq_x_u8): Delete. (vidupq_x_u16): Delete. (vidupq_x_u32): Delete. (vddupq_m_n_u8): Delete. (vddupq_m_n_u32): Delete. (vddupq_m_n_u16): Delete. (vddupq_m_wb_u8): Delete. (vddupq_m_wb_u16): Delete. (vddupq_m_wb_u32): Delete. (vddupq_n_u8): Delete. (vddupq_n_u32): Delete. (vddupq_n_u16): Delete. (vddupq_wb_u8): Delete. (vddupq_wb_u16): Delete. (vddupq_wb_u32): Delete. (vidupq_m_n_u8): Delete. (vidupq_m_n_u32): Delete. (vidupq_m_n_u16): Delete. (vidupq_m_wb_u8): Delete. (vidupq_m_wb_u16): Delete. (vidupq_m_wb_u32): Delete. (vidupq_n_u8): Delete. (vidupq_n_u32): Delete. (vidupq_n_u16): Delete. (vidupq_wb_u8): Delete. (vidupq_wb_u16): Delete. (vidupq_wb_u32): Delete. (vddupq_x_n_u8): Delete. (vddupq_x_n_u16): Delete. (vddupq_x_n_u32): Delete. (vddupq_x_wb_u8): Delete. (vddupq_x_wb_u16): Delete. (vddupq_x_wb_u32): Delete. (vidupq_x_n_u8): Delete. (vidupq_x_n_u16): Delete. (vidupq_x_n_u32): Delete. (vidupq_x_wb_u8): Delete. (vidupq_x_wb_u16): Delete. (vidupq_x_wb_u32): Delete. (__arm_vddupq_m_n_u8): Delete. (__arm_vddupq_m_n_u32): Delete. (__arm_vddupq_m_n_u16): Delete. (__arm_vddupq_m_wb_u8): Delete. (__arm_vddupq_m_wb_u16): Delete. (__arm_vddupq_m_wb_u32): Delete. (__arm_vddupq_n_u8): Delete. (__arm_vddupq_n_u32): Delete. (__arm_vddupq_n_u16): Delete. (__arm_vidupq_m_n_u8): Delete. (__arm_vidupq_m_n_u32): Delete. (__arm_vidupq_m_n_u16): Delete. (__arm_vidupq_n_u8): Delete. (__arm_vidupq_m_wb_u8): Delete. (__arm_vidupq_m_wb_u16): Delete. (__arm_vidupq_m_wb_u32): Delete. (__arm_vidupq_n_u32): Delete. (__arm_vidupq_n_u16): Delete. (__arm_vidupq_wb_u8): Delete. (__arm_vidupq_wb_u16): Delete. (__arm_vidupq_wb_u32): Delete. (__arm_vddupq_wb_u8): Delete. (__arm_vddupq_wb_u16): Delete. (__arm_vddupq_wb_u32): Delete. (__arm_vddupq_x_n_u8): Delete. (__arm_vddupq_x_n_u16): Delete. (__arm_vddupq_x_n_u32): Delete. (__arm_vddupq_x_wb_u8): Delete. (__arm_vddupq_x_wb_u16): Delete. (__arm_vddupq_x_wb_u32): Delete. (__arm_vidupq_x_n_u8): Delete. (__arm_vidupq_x_n_u16): Delete. (__arm_vidupq_x_n_u32): Delete. (__arm_vidupq_x_wb_u8): Delete. (__arm_vidupq_x_wb_u16): Delete. (__arm_vidupq_x_wb_u32): Delete. (__arm_vddupq_m): Delete. (__arm_vddupq_u8): Delete. (__arm_vddupq_u32): Delete. (__arm_vddupq_u16): Delete. (__arm_vidupq_m): Delete. (__arm_vidupq_u8): Delete. (__arm_vidupq_u32): Delete. (__arm_vidupq_u16): Delete. (__arm_vddupq_x_u8): Delete. (__arm_vddupq_x_u16): Delete. (__arm_vddupq_x_u32): Delete. (__arm_vidupq_x_u8): Delete. (__arm_vidupq_x_u16): Delete. (__arm_vidupq_x_u32): Delete.
1 parent e38566a commit d7250b6

File tree

4 files changed

+116
-676
lines changed

4 files changed

+116
-676
lines changed

gcc/config/arm/arm-mve-builtins-base.cc

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "basic-block.h"
3131
#include "function.h"
3232
#include "gimple.h"
33+
#include "emit-rtl.h"
3334
#include "arm-mve-builtins.h"
3435
#include "arm-mve-builtins-shapes.h"
3536
#include "arm-mve-builtins-base.h"
@@ -402,6 +403,115 @@ class vcvtxq_impl : public function_base
402403
}
403404
};
404405

406+
/* Map the vidup / vddup function directly to CODE (UNSPEC, M) where M is the
407+
vector mode associated with type suffix 0. We need this special case
408+
because in MODE_wb the builtins derefrence the first parameter and update
409+
its contents. We also have to insert the two additional parameters needed
410+
by the builtins compared to the intrinsics. */
411+
class viddup_impl : public function_base
412+
{
413+
public:
414+
CONSTEXPR viddup_impl (bool inc_dec)
415+
: m_inc_dec (inc_dec)
416+
{}
417+
418+
/* Increment (true) or decrement (false). */
419+
bool m_inc_dec;
420+
421+
unsigned int
422+
call_properties (const function_instance &fi) const override
423+
{
424+
if (fi.mode_suffix_id == MODE_wb)
425+
return CP_WRITE_MEMORY | CP_READ_MEMORY;
426+
else
427+
return 0;
428+
}
429+
430+
tree
431+
memory_scalar_type (const function_instance &) const override
432+
{
433+
return get_typenode_from_name (UINT32_TYPE);
434+
}
435+
436+
rtx
437+
expand (function_expander &e) const override
438+
{
439+
machine_mode mode = e.vector_mode (0);
440+
insn_code code;
441+
rtx insns, offset_ptr;
442+
rtx new_offset;
443+
int offset_arg_no;
444+
rtx incr, total_incr;
445+
446+
if (! e.type_suffix (0).integer_p)
447+
gcc_unreachable ();
448+
449+
if ((e.mode_suffix_id != MODE_n)
450+
&& (e.mode_suffix_id != MODE_wb))
451+
gcc_unreachable ();
452+
453+
offset_arg_no = (e.pred == PRED_m) ? 1 : 0;
454+
455+
/* In _wb mode, the start offset is passed via a pointer,
456+
dereference it. */
457+
if (e.mode_suffix_id == MODE_wb)
458+
{
459+
rtx offset = gen_reg_rtx (SImode);
460+
offset_ptr = e.args[offset_arg_no];
461+
emit_insn (gen_rtx_SET (offset, gen_rtx_MEM (SImode, offset_ptr)));
462+
e.args[offset_arg_no] = offset;
463+
}
464+
465+
/* We have to shuffle parameters because the builtin needs additional
466+
arguments:
467+
- the updated "new_offset"
468+
- total increment (incr * number of lanes) */
469+
new_offset = gen_reg_rtx (SImode);
470+
e.args.quick_insert (offset_arg_no, new_offset);
471+
472+
incr = e.args[offset_arg_no + 2];
473+
total_incr = gen_int_mode (INTVAL (incr)
474+
* GET_MODE_NUNITS (e.vector_mode (0)),
475+
SImode);
476+
e.args.quick_push (total_incr);
477+
478+
/* _wb mode uses the _n builtins and adds code to update the
479+
offset. */
480+
switch (e.pred)
481+
{
482+
case PRED_none:
483+
/* No predicate. */
484+
code = m_inc_dec
485+
? code_for_mve_q_u_insn (VIDUPQ, mode)
486+
: code_for_mve_q_u_insn (VDDUPQ, mode);
487+
insns = e.use_exact_insn (code);
488+
break;
489+
490+
case PRED_m:
491+
case PRED_x:
492+
/* "m" or "x" predicate. */
493+
code = m_inc_dec
494+
? code_for_mve_q_m_wb_u_insn (VIDUPQ_M, mode)
495+
: code_for_mve_q_m_wb_u_insn (VDDUPQ_M, mode);
496+
497+
if (e.pred == PRED_m)
498+
insns = e.use_cond_insn (code, 0);
499+
else
500+
insns = e.use_pred_x_insn (code);
501+
break;
502+
503+
default:
504+
gcc_unreachable ();
505+
}
506+
507+
/* Update offset as appropriate. */
508+
if (e.mode_suffix_id == MODE_wb)
509+
emit_insn (gen_rtx_SET (gen_rtx_MEM (Pmode, offset_ptr), new_offset));
510+
511+
return insns;
512+
}
513+
};
514+
405515
} /* end anonymous namespace */
406516

407517
namespace arm_mve {
@@ -614,7 +724,9 @@ FUNCTION_WITHOUT_N_NO_F (vcvtmq, VCVTMQ)
614724
FUNCTION_WITHOUT_N_NO_F (vcvtnq, VCVTNQ)
615725
FUNCTION_WITHOUT_N_NO_F (vcvtpq, VCVTPQ)
616726
FUNCTION (vcvttq, vcvtxq_impl, (VCVTTQ_F16_F32, VCVTTQ_M_F16_F32, VCVTTQ_F32_F16, VCVTTQ_M_F32_F16))
727+
FUNCTION (vddupq, viddup_impl, (false))
617728
FUNCTION (vdupq, vdupq_impl, (VDUPQ_M_N_S, VDUPQ_M_N_U, VDUPQ_M_N_F))
729+
FUNCTION (vidupq, viddup_impl, (true))
618730
FUNCTION_WITH_RTX_M (veorq, XOR, VEORQ)
619731
FUNCTION (vfmaq, unspec_mve_function_exact_insn, (-1, -1, VFMAQ_F, -1, -1, VFMAQ_N_F, -1, -1, VFMAQ_M_F, -1, -1, VFMAQ_M_N_F))
620732
FUNCTION (vfmasq, unspec_mve_function_exact_insn, (-1, -1, -1, -1, -1, VFMASQ_N_F, -1, -1, -1, -1, -1, VFMASQ_M_N_F))

gcc/config/arm/arm-mve-builtins-base.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,14 @@ DEF_MVE_FUNCTION (vctp16q, vctp, none, m_or_none)
4646
DEF_MVE_FUNCTION (vctp32q, vctp, none, m_or_none)
4747
DEF_MVE_FUNCTION (vctp64q, vctp, none, m_or_none)
4848
DEF_MVE_FUNCTION (vctp8q, vctp, none, m_or_none)
49+
DEF_MVE_FUNCTION (vddupq, viddup, all_unsigned, mx_or_none)
4950
DEF_MVE_FUNCTION (vdupq, unary_n, all_integer, mx_or_none)
5051
DEF_MVE_FUNCTION (veorq, binary, all_integer, mx_or_none)
5152
DEF_MVE_FUNCTION (vhaddq, binary_opt_n, all_integer, mx_or_none)
5253
DEF_MVE_FUNCTION (vhcaddq_rot270, binary, all_signed, mx_or_none)
5354
DEF_MVE_FUNCTION (vhcaddq_rot90, binary, all_signed, mx_or_none)
5455
DEF_MVE_FUNCTION (vhsubq, binary_opt_n, all_integer, mx_or_none)
56+
DEF_MVE_FUNCTION (vidupq, viddup, all_unsigned, mx_or_none)
5557
DEF_MVE_FUNCTION (vld1q, load, all_integer, none)
5658
DEF_MVE_FUNCTION (vmaxaq, binary_maxamina, all_signed, m_or_none)
5759
DEF_MVE_FUNCTION (vmaxavq, binary_maxavminav, all_signed, p_or_none)

gcc/config/arm/arm-mve-builtins-base.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ extern const function_base *const vcvtnq;
6666
extern const function_base *const vcvtpq;
6767
extern const function_base *const vcvtq;
6868
extern const function_base *const vcvttq;
69+
extern const function_base *const vddupq;
6970
extern const function_base *const vdupq;
7071
extern const function_base *const veorq;
7172
extern const function_base *const vfmaq;
@@ -75,6 +76,7 @@ extern const function_base *const vhaddq;
7576
extern const function_base *const vhcaddq_rot270;
7677
extern const function_base *const vhcaddq_rot90;
7778
extern const function_base *const vhsubq;
79+
extern const function_base *const vidupq;
7880
extern const function_base *const vld1q;
7981
extern const function_base *const vmaxaq;
8082
extern const function_base *const vmaxavq;

0 commit comments

Comments
 (0)