Skip to content

Commit cf6b4a4

Browse files
committed
Add sqr, mulhigh and sqrhigh routines for Arm
Also clean up some code for related tests. The added routines are: * flint_mpn_sqr_N for N <= 9, * flint_mpn_mulhigh_N for N <= 8, * flint_mpn_sqrhigh_N for N <= 8, * _flint_mpn_mulhigh_basecase, which works for n > 8. Also optimized flint_mpn_mul_8n for Arm.
1 parent 5a88847 commit cf6b4a4

21 files changed

+3750
-365
lines changed

src/mpn_extras.h

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -247,11 +247,11 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
247247
# define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp))
248248
#elif FLINT_HAVE_ASSEMBLY_armv8
249249
# define FLINT_MPN_MUL_FUNC_N_TAB_WIDTH 15
250-
# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 0
250+
# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 9
251251

252252
# define FLINT_HAVE_MUL_FUNC(n, m) FLINT_HAVE_MUL_N_FUNC(n)
253253
# define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= FLINT_MPN_MUL_FUNC_N_TAB_WIDTH)
254-
# define FLINT_HAVE_SQR_FUNC(n) (0)
254+
# define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH)
255255

256256
# define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_n_tab[xn](rp, xp, yp, yn))
257257
# define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_func_n_tab[n](rp, xp, yp, n))
@@ -417,6 +417,19 @@ FLINT_DLL extern const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_nor
417417
# define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 9
418418
# define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8
419419
# define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 9
420+
421+
/* NOTE: This function only works for n >= 6 */
422+
# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1
423+
424+
/* NOTE: The x86_64_adx versions of these functions only work for n >= 6 */
425+
# define FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase 1
426+
#elif FLINT_HAVE_ASSEMBLY_armv8
427+
# define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 8
428+
# define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8
429+
# define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 0
430+
431+
/* NOTE: This function only works for n > 8 */
432+
# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1
420433
#else
421434
# define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 16
422435
# define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 2
@@ -429,8 +442,6 @@ FLINT_DLL extern const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_nor
429442

430443
void _flint_mpn_mulhigh_n_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n);
431444

432-
/* NOTE: The x86_64_adx version of this function only works for n >= 6 */
433-
# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1
434445
mp_limb_t _flint_mpn_mulhigh_basecase(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n);
435446

436447
mp_limb_t _flint_mpn_mulhigh_n_mulders(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n);
@@ -468,9 +479,6 @@ void flint_mpn_mul_or_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t
468479
#define FLINT_MPN_SQRHIGH_SQR_CUTOFF 2000
469480
#define FLINT_MPN_SQRHIGH_K_TAB_SIZE 2048
470481

471-
/* NOTE: The x86_64_adx versions of these functions only works for n >= 6 */
472-
# define FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase 1
473-
474482
#if FLINT_HAVE_ASSEMBLY_x86_64_adx
475483

476484
mp_limb_t _flint_mpn_sqrhigh_basecase_even(mp_ptr, mp_srcptr, mp_size_t);
@@ -499,8 +507,6 @@ MPN_EXTRAS_INLINE mp_limb_t _flint_mpn_sqrhigh_basecase(mp_ptr res, mp_srcptr u,
499507

500508
void _flint_mpn_sqrhigh_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_size_t n);
501509
mp_limb_t _flint_mpn_sqrhigh_mulders(mp_ptr res, mp_srcptr u, mp_size_t n);
502-
mp_limb_t _flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n);
503-
mp_limb_t flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n);
504510
mp_limb_t _flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n);
505511
mp_limb_t _flint_mpn_sqrhigh(mp_ptr, mp_srcptr, mp_size_t);
506512

src/mpn_extras/arm64/arm64-defs.m4

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,15 @@ dnl don't want to disable macro expansions in or after them.
3737
changecom
3838

3939

40-
dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol)
40+
dnl LEA_HI(reg,gmp_symbol)
41+
dnl LEA_LO(reg,gmp_symbol)
42+
dnl LOH_ADRPADD(labelhi,labello)
4143
dnl
42-
dnl Load the address of gmp_symbol into a register. We split this into two
43-
dnl parts to allow separation for manual insn scheduling.
44+
dnl Load the address of gmp_symbol into a register. label(hi|lo) have to
45+
dnl be unique labels that LEA_HI and LEA_LO are put on. We split this
46+
dnl into three parts to allow separation for manual insn scheduling.
47+
dnl LOH_ADRPADD has to be called as well, but remains unused on
48+
dnl non-Darwin systems.
4449

4550
ifdef(`PIC',`dnl
4651
define(`LEA_HI', `adrp $1, :got:$2')dnl
@@ -50,4 +55,18 @@ define(`LEA_HI', `adrp $1, $2')dnl
5055
define(`LEA_LO', `add $1, $1, :lo12:$2')dnl
5156
')dnl
5257

58+
define(`LOH_ADRPADD',`')dnl
59+
60+
61+
dnl LBL_HI(reg,gmp_symbol)
62+
dnl LBL_LO(reg,gmp_symbol)
63+
dnl
64+
dnl Load the label of gmp_symbol into a register. We split this into
65+
dnl three parts to allow separation for manual insn scheduling.
66+
dnl LOH_ADRPADD has to be called as well, with the labels put on LBL_HI and LBL_LO
67+
dnl (*not* gmp_symbol), but remains unused on non-Darwin systems.
68+
69+
define(`LBL_HI', `adrp $1, $2')dnl
70+
define(`LBL_LO', `add $1, $1, :lo12:$2')dnl
71+
5372
divert`'dnl

src/mpn_extras/arm64/darwin.m4

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,29 @@ dnl don't want to disable macro expansions in or after them.
3737
changecom
3838

3939

40-
dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol)
40+
dnl LEA_HI(reg,gmp_symbol)
41+
dnl LEA_LO(reg,gmp_symbol)
42+
dnl LOH_ADRPADD(labelhi,labello)
4143
dnl
42-
dnl Load the address of gmp_symbol into a register. We split this into two
43-
dnl parts to allow separation for manual insn scheduling. TODO: Darwin allows
44-
dnl for relaxing these two insns into an adr and a nop, but that requires the
45-
dnl .loh pseudo for connecting them.
44+
dnl Load the address of gmp_symbol into a register. label(hi|lo) have
45+
dnl to be unique labels that LEA_HI and LEA_LO are put on. We split
46+
dnl this into three parts to allow separation for manual insn
47+
dnl scheduling. LOH_ADRPADD has to be called as well.
4648

47-
define(`LEA_HI',`adrp $1, $2@GOTPAGE')dnl
48-
define(`LEA_LO',`ldr $1, [$1, $2@GOTPAGEOFF]')dnl
49+
define(`LEA_HI', `adrp $1, $2@GOTPAGE')dnl
50+
define(`LEA_LO', `add $1, $1, $2@GOTPAGEOFF')dnl
51+
define(`LOH_ADRPADD', `.loh AdrpAdd $1, $2')dnl
52+
53+
54+
dnl LBL_HI(reg,gmp_symbol)
55+
dnl LBL_LO(reg,gmp_symbol)
56+
dnl
57+
dnl Load the label of gmp_symbol into a register. We split this into
58+
dnl three parts to allow separation for manual insn scheduling.
59+
dnl LOH_ADRPADD has to be called as well, with the labels put on LBL_HI and LBL_LO
60+
dnl (*not* gmp_symbol).
61+
62+
define(`LBL_HI', `adrp $1, $2@PAGE')dnl
63+
define(`LBL_LO', `add $1, $1, $2@PAGEOFF')dnl
4964

5065
divert`'dnl

src/mpn_extras/arm64/mul_hard.asm

Lines changed: 96 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -519,102 +519,120 @@ EPILOGUE()
519519
PROLOGUE(flint_mpn_mul_8n)
520520
ldr r8, [bp], #1*8
521521
ldp r0, r1, [ap]
522-
ldp r2, r3, [ap,#2*8]
523-
ldp r4, r5, [ap,#4*8]
524-
ldp r6, r7, [ap,#6*8]
525-
526-
stp r14, r15, [sp, #-4*8]!
522+
ldp r2, r3, [ap, #2*8]
523+
stp r14, r15, [sp, #-6*8]!
524+
ldp r4, r5, [ap, #4*8]
525+
ldp r6, r7, [ap, #6*8]
527526
stp r16, r17, [sp, #2*8]
527+
stp r18, r19, [sp, #4*8]
528528

529-
L(m8): mul r9, r0, r8 C a0 b0
529+
L(m8): mul r9, r0, r8
530530
umulh r10, r0, r8
531-
mul r11, r1, r8 C a1 b0
532-
umulh r14, r1, r8
533-
mul ap, r2, r8 C a2 b0
534-
umulh r15, r2, r8
535-
mul r12, r3, r8 C a3 b0
531+
mul r11, r1, r8
532+
umulh r12, r1, r8
533+
mul r13, r2, r8
534+
umulh r14, r2, r8
535+
mul r15, r3, r8
536536
umulh r16, r3, r8
537-
mul r13, r4, r8 C a4 b0
538-
umulh r17, r4, r8
537+
mul r17, r4, r8
538+
umulh r18, r4, r8
539+
mul r19, r5, r8
540+
umulh ap, r5, r8
541+
C 9, (10, 11), (12, 13), (14, 15), (16, 17), (18, 19), ap
542+
539543
str r9, [rp], #1*8
540544
sub n, n, #1
541545
adds r10, r10, r11
542-
adcs r14, r14, ap
543-
adcs r15, r15, r12
544-
adcs r16, r16, r13
545-
mul r9, r5, r8 C a5 b0
546-
umulh r11, r5, r8
547-
mul ap, r6, r8 C a6 b0
548-
umulh r12, r6, r8
549-
mul r13, r7, r8 C a7 b0
550-
umulh r8, r7, r8
551-
adcs r17, r17, r9
552-
adcs r11, r11, ap
546+
547+
mul r9, r6, r8
548+
umulh r11, r6, r8
549+
553550
adcs r12, r12, r13
554-
cinc r9, r8, cs
555-
C 10, 14, 15, 16, 17, 11, 12, 9
551+
adcs r14, r14, r15
552+
553+
mul r13, r7, r8
554+
umulh r15, r7, r8
555+
C 10, 12, 14, (16, 17), (18, 19), (ap, 9), (11, 13), 15
556556

557557
cbz n, L(f8)
558558

559-
stp r18, r19, [sp, #-8*8]!
560-
stp r20, r21, [sp, #2*8]
561-
stp r22, r23, [sp, #4*8]
562-
str r24, [sp, #6*8]
559+
stp r20, r21, [sp, #-6*8]!
560+
stp r22, r23, [sp, #2*8]
561+
str r24, [sp, #4*8]
562+
563563
L(a8): ldr r8, [bp], #1*8
564-
mul r13, r0, r8 C a0 b0
565-
umulh ap, r0, r8
566-
mul r18, r1, r8 C a1 b0
567-
umulh r19, r1, r8
568-
mul r20, r2, r8 C a2 b0
569-
umulh r21, r2, r8
570-
mul r22, r3, r8 C a3 b0
571-
umulh r23, r3, r8
572-
mul r24, r4, r8 C a4 b0
573-
adds ap, ap, r18
574-
adcs r19, r19, r20
575-
adcs r21, r21, r22
576-
adcs r23, r23, r24
564+
adcs r16, r16, r17
565+
adcs r18, r18, r19
566+
adcs ap, ap, r9
567+
adcs r11, r11, r13
568+
cinc r15, r15, cs
569+
C 10, 12, 14, 16, 18, ap, 11, 15
570+
C Free: 20, 21, 22, 23, 24, 17, 19, 9, 13
571+
572+
mul r20, r0, r8
573+
mul r21, r1, r8
574+
mul r22, r2, r8
575+
mul r23, r3, r8
576+
mul r17, r4, r8
577+
mul r19, r5, r8
578+
mul r9, r6, r8
579+
mul r13, r7, r8
580+
581+
adds r20, r20, r10
582+
umulh r24, r7, r8
583+
584+
C (10, 20), (12, 21), (14, 22), (16, 23), (18, 17), (ap, 19), (11, 9), (15, 13), 24
585+
586+
adcs r21, r21, r12
587+
umulh r10, r0, r8
588+
589+
adcs r22, r22, r14
590+
umulh r12, r1, r8
591+
592+
adcs r23, r23, r16
593+
umulh r14, r2, r8
594+
595+
adcs r17, r17, r18
596+
umulh r16, r3, r8
597+
598+
adcs r19, r19, ap
577599
umulh r18, r4, r8
578-
mul r20, r5, r8 C a5 b0
579-
umulh r22, r5, r8
580-
mul r24, r6, r8 C a6 b0
581-
sub n, n, #1
582-
adcs r18, r18, r20
583-
adcs r22, r22, r24
584-
umulh r20, r6, r8
585-
mul r24, r7, r8 C a7 b0
586-
umulh r8, r7, r8
587-
adcs r20, r20, r24
588-
cinc r8, r8, cs
589-
C 13, ap, 19, 21, 23, 18, 22, 20, 8
590-
591-
C (13, (10, 14, 15, 16, 17, 11, 12, 9))
592-
C <- (10, 14, 15, 16, 17, 11, 12, 9) + (13, ap, 19, 21, 23, 18, 22, 20, 8)
593-
adds r13, r10, r13
594-
adcs r10, r14, ap
595-
adcs r14, r15, r19
596-
adcs r15, r16, r21
597-
adcs r16, r17, r23
598-
adcs r17, r11, r18
599-
adcs r11, r12, r22
600-
adcs r12, r9, r20
601-
cinc r9, r8, cs
602600

603-
str r13, [rp], #1*8
601+
adcs r9, r9, r11
602+
umulh ap, r5, r8
603+
604+
adcs r13, r13, r15
605+
umulh r11, r6, r8
606+
607+
str r20, [rp], #1*8
608+
609+
cinc r15, r24, cs
604610

611+
adds r10, r10, r21
612+
adcs r12, r12, r22
613+
adcs r14, r14, r23
614+
C 10, 12, 14, (16, 17), (18, 19), (ap, 9), (11, 13), 15
615+
616+
sub n, n, #1
605617
cbnz n, L(a8)
606618

607-
ldp r20, r21, [sp, #2*8]
608-
ldp r22, r23, [sp, #4*8]
609-
ldr r24, [sp, #6*8]
610-
ldp r18, r19, [sp], #8*8
611-
L(f8): stp r10, r14, [rp]
612-
stp r15, r16, [rp,#2*8]
613-
stp r17, r11, [rp,#4*8]
614-
stp r12, r9, [rp,#6*8]
619+
ldp r22, r23, [sp, #2*8]
620+
ldr r24, [sp, #4*8]
621+
ldp r20, r21, [sp], #6*8
622+
L(f8): adcs r16, r16, r17
623+
adcs r18, r18, r19
624+
stp r10, r12, [rp]
625+
adcs ap, ap, r9
626+
adcs r11, r11, r13
627+
stp r14, r16, [rp, #2*8]
628+
cinc r15, r15, cs
615629
ldp r16, r17, [sp, #2*8]
616-
ldp r14, r15, [sp], #4*8
617-
mov res, r9
630+
stp r18, ap, [rp, #4*8]
631+
ldp r18, r19, [sp, #4*8]
632+
stp r11, r15, [rp, #6*8]
633+
mov res, r15
634+
ldp r14, r15, [sp], #6*8
635+
618636
ret
619637
EPILOGUE()
620638

0 commit comments

Comments
 (0)