diff --git a/src/mpn_extras.h b/src/mpn_extras.h
index 1d8420ccb1..28d57af8c8 100644
--- a/src/mpn_extras.h
+++ b/src/mpn_extras.h
@@ -247,11 +247,11 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
# define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp))
#elif FLINT_HAVE_ASSEMBLY_armv8
# define FLINT_MPN_MUL_FUNC_N_TAB_WIDTH 15
-# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 0
+# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 9
# define FLINT_HAVE_MUL_FUNC(n, m) FLINT_HAVE_MUL_N_FUNC(n)
# define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= FLINT_MPN_MUL_FUNC_N_TAB_WIDTH)
-# define FLINT_HAVE_SQR_FUNC(n) (0)
+# define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH)
# define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_n_tab[xn](rp, xp, yp, yn))
# define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_func_n_tab[n](rp, xp, yp, n))
@@ -417,6 +417,19 @@ FLINT_DLL extern const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_nor
# define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 9
# define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8
# define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 9
+
+/* NOTE: This function only works for n >= 6 */
+# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1
+
+/* NOTE: The x86_64_adx versions of these functions only works for n >= 6 */
+# define FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase 1
+#elif FLINT_HAVE_ASSEMBLY_armv8
+# define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 8
+# define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8
+# define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 0
+
+/* NOTE: This function only works for n > 8 */
+# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1
#else
# define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 16
# define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 2
@@ -429,8 +442,6 @@ FLINT_DLL extern const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_nor
void _flint_mpn_mulhigh_n_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n);
-/* NOTE: The x86_64_adx version of this function only works for n >= 6 */
-# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1
mp_limb_t _flint_mpn_mulhigh_basecase(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n);
mp_limb_t _flint_mpn_mulhigh_n_mulders(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n);
@@ -468,9 +479,6 @@ void flint_mpn_mul_or_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t
#define FLINT_MPN_SQRHIGH_SQR_CUTOFF 2000
#define FLINT_MPN_SQRHIGH_K_TAB_SIZE 2048
-/* NOTE: The x86_64_adx versions of these functions only works for n >= 6 */
-# define FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase 1
-
#if FLINT_HAVE_ASSEMBLY_x86_64_adx
mp_limb_t _flint_mpn_sqrhigh_basecase_even(mp_ptr, mp_srcptr, mp_size_t);
@@ -499,8 +507,6 @@ MPN_EXTRAS_INLINE mp_limb_t _flint_mpn_sqrhigh_basecase(mp_ptr res, mp_srcptr u,
void _flint_mpn_sqrhigh_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_size_t n);
mp_limb_t _flint_mpn_sqrhigh_mulders(mp_ptr res, mp_srcptr u, mp_size_t n);
-mp_limb_t _flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n);
-mp_limb_t flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n);
mp_limb_t _flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n);
mp_limb_t _flint_mpn_sqrhigh(mp_ptr, mp_srcptr, mp_size_t);
diff --git a/src/mpn_extras/arm64/arm64-defs.m4 b/src/mpn_extras/arm64/arm64-defs.m4
index 46149f7bfd..112b60e550 100644
--- a/src/mpn_extras/arm64/arm64-defs.m4
+++ b/src/mpn_extras/arm64/arm64-defs.m4
@@ -37,10 +37,15 @@ dnl don't want to disable macro expansions in or after them.
changecom
-dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol)
+dnl LEA_HI(reg,gmp_symbol)
+dnl LEA_LO(reg,gmp_symbol)
+dnl LOH_ADRPADD(labelhi,labello)
dnl
-dnl Load the address of gmp_symbol into a register. We split this into two
-dnl parts to allow separation for manual insn scheduling.
+dnl Load the address of gmp_symbol into a register. label(hi|lo) has to
+dnl be unique labels that LEA_HI and LEA_LO are put on. We split this
+dnl into three parts to allow separation for manual insn scheduling.
+dnl LOH_ADRPADD has to be called as well, but remains unused on
+dnl non-Darwin systems.
ifdef(`PIC',`dnl
define(`LEA_HI', `adrp $1, :got:$2')dnl
@@ -50,4 +55,18 @@ define(`LEA_HI', `adrp $1, $2')dnl
define(`LEA_LO', `add $1, $1, :lo12:$2')dnl
')dnl
+define(`LOH_ADRPADD',`')dnl
+
+
+dnl LBL_HI(reg,gmp_symbol)
+dnl LBL_LO(reg,gmp_symbol)
+dnl
+dnl Load the label of gmp_symbol into a register. We split this into
+dnl three parts to allow separation for manual insn scheduling.
+dnl LOH_ADRPADD has to be called as well with their respective labels
+dnl (*not* gmp_symbol), but remains unused on non-Darwin systems.
+
+define(`LBL_HI', `adrp $1, $2')dnl
+define(`LBL_LO', `add $1, $1, :lo12:$2')dnl
+
divert`'dnl
diff --git a/src/mpn_extras/arm64/darwin.m4 b/src/mpn_extras/arm64/darwin.m4
index 36e72fe45e..296f18dbb3 100644
--- a/src/mpn_extras/arm64/darwin.m4
+++ b/src/mpn_extras/arm64/darwin.m4
@@ -37,14 +37,29 @@ dnl don't want to disable macro expansions in or after them.
changecom
-dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol)
+dnl LEA_HI(reg,gmp_symbol)
+dnl LEA_LO(reg,gmp_symbol)
+dnl LOH_ADRPADD(labelhi,labello)
dnl
-dnl Load the address of gmp_symbol into a register. We split this into two
-dnl parts to allow separation for manual insn scheduling. TODO: Darwin allows
-dnl for relaxing these two insns into an adr and a nop, but that requires the
-dnl .loh pseudo for connecting them.
+dnl Load the address of gmp_symbol into a register. label(hi|lo) has
+dnl to be unique labels that LEA_HI and LEA_LO are put on. We split
+dnl this into three parts to allow separation for manual insn
+dnl scheduling. LOH_ADRPADD has to be called as well.
-define(`LEA_HI',`adrp $1, $2@GOTPAGE')dnl
-define(`LEA_LO',`ldr $1, [$1, $2@GOTPAGEOFF]')dnl
+define(`LEA_HI', `adrp $1, $2@GOTPAGE')dnl
+define(`LEA_LO', `add $1, $1, $2@GOTPAGEOFF')dnl
+define(`LOH_ADRPADD', `.loh AdrpAdd $1, $2')dnl
+
+
+dnl LBL_HI(reg,gmp_symbol)
+dnl LBL_LO(reg,gmp_symbol)
+dnl
+dnl Load the label of gmp_symbol into a register. We split this into
+dnl three parts to allow separation for manual insn scheduling.
+dnl LOH_ADRPADD has to be called as well with their respective labels
+dnl (*not* gmp_symbol).
+
+define(`LBL_HI', `adrp $1, $2@PAGE')dnl
+define(`LBL_LO', `add $1, $1, $2@PAGEOFF')dnl
divert`'dnl
diff --git a/src/mpn_extras/arm64/mul_hard.asm b/src/mpn_extras/arm64/mul_hard.asm
index f6a9ccb94f..dbd08287d3 100644
--- a/src/mpn_extras/arm64/mul_hard.asm
+++ b/src/mpn_extras/arm64/mul_hard.asm
@@ -519,102 +519,120 @@ EPILOGUE()
PROLOGUE(flint_mpn_mul_8n)
ldr r8, [bp], #1*8
ldp r0, r1, [ap]
- ldp r2, r3, [ap,#2*8]
- ldp r4, r5, [ap,#4*8]
- ldp r6, r7, [ap,#6*8]
-
- stp r14, r15, [sp, #-4*8]!
+ ldp r2, r3, [ap, #2*8]
+ stp r14, r15, [sp, #-6*8]!
+ ldp r4, r5, [ap, #4*8]
+ ldp r6, r7, [ap, #6*8]
stp r16, r17, [sp, #2*8]
+ stp r18, r19, [sp, #4*8]
-L(m8): mul r9, r0, r8 C a0 b0
+L(m8): mul r9, r0, r8
umulh r10, r0, r8
- mul r11, r1, r8 C a1 b0
- umulh r14, r1, r8
- mul ap, r2, r8 C a2 b0
- umulh r15, r2, r8
- mul r12, r3, r8 C a3 b0
+ mul r11, r1, r8
+ umulh r12, r1, r8
+ mul r13, r2, r8
+ umulh r14, r2, r8
+ mul r15, r3, r8
umulh r16, r3, r8
- mul r13, r4, r8 C a4 b0
- umulh r17, r4, r8
+ mul r17, r4, r8
+ umulh r18, r4, r8
+ mul r19, r5, r8
+ umulh ap, r5, r8
+ C 9, (10, 11), (12, 13), (14, 15), (16, 17), (18, 19), ap
+
str r9, [rp], #1*8
sub n, n, #1
adds r10, r10, r11
- adcs r14, r14, ap
- adcs r15, r15, r12
- adcs r16, r16, r13
- mul r9, r5, r8 C a5 b0
- umulh r11, r5, r8
- mul ap, r6, r8 C a6 b0
- umulh r12, r6, r8
- mul r13, r7, r8 C a7 b0
- umulh r8, r7, r8
- adcs r17, r17, r9
- adcs r11, r11, ap
+
+ mul r9, r6, r8
+ umulh r11, r6, r8
+
adcs r12, r12, r13
- cinc r9, r8, cs
- C 10, 14, 15, 16, 17, 11, 12, 9
+ adcs r14, r14, r15
+
+ mul r13, r7, r8
+ umulh r15, r7, r8
+ C 10, 12, 14, (16, 17), (18, 19), (ap, 9), (11, 13), 15
cbz n, L(f8)
- stp r18, r19, [sp, #-8*8]!
- stp r20, r21, [sp, #2*8]
- stp r22, r23, [sp, #4*8]
- str r24, [sp, #6*8]
+ stp r20, r21, [sp, #-6*8]!
+ stp r22, r23, [sp, #2*8]
+ str r24, [sp, #4*8]
+
L(a8): ldr r8, [bp], #1*8
- mul r13, r0, r8 C a0 b0
- umulh ap, r0, r8
- mul r18, r1, r8 C a1 b0
- umulh r19, r1, r8
- mul r20, r2, r8 C a2 b0
- umulh r21, r2, r8
- mul r22, r3, r8 C a3 b0
- umulh r23, r3, r8
- mul r24, r4, r8 C a4 b0
- adds ap, ap, r18
- adcs r19, r19, r20
- adcs r21, r21, r22
- adcs r23, r23, r24
+ adcs r16, r16, r17
+ adcs r18, r18, r19
+ adcs ap, ap, r9
+ adcs r11, r11, r13
+ cinc r15, r15, cs
+ C 10, 12, 14, 16, 18, ap, 11, 15
+ C Free: 20, 21, 22, 23, 24, 17, 19, 9, 13
+
+ mul r20, r0, r8
+ mul r21, r1, r8
+ mul r22, r2, r8
+ mul r23, r3, r8
+ mul r17, r4, r8
+ mul r19, r5, r8
+ mul r9, r6, r8
+ mul r13, r7, r8
+
+ adds r20, r20, r10
+ umulh r24, r7, r8
+
+ C (10, 20), (12, 21), (14, 22), (16, 23), (18, 17), (ap, 19), (11, 9), (15, 13), 24
+
+ adcs r21, r21, r12
+ umulh r10, r0, r8
+
+ adcs r22, r22, r14
+ umulh r12, r1, r8
+
+ adcs r23, r23, r16
+ umulh r14, r2, r8
+
+ adcs r17, r17, r18
+ umulh r16, r3, r8
+
+ adcs r19, r19, ap
umulh r18, r4, r8
- mul r20, r5, r8 C a5 b0
- umulh r22, r5, r8
- mul r24, r6, r8 C a6 b0
- sub n, n, #1
- adcs r18, r18, r20
- adcs r22, r22, r24
- umulh r20, r6, r8
- mul r24, r7, r8 C a7 b0
- umulh r8, r7, r8
- adcs r20, r20, r24
- cinc r8, r8, cs
- C 13, ap, 19, 21, 23, 18, 22, 20, 8
-
- C (13, (10, 14, 15, 16, 17, 11, 12, 9))
- C <- (10, 14, 15, 16, 17, 11, 12, 9) + (13, ap, 19, 21, 23, 18, 22, 20, 8)
- adds r13, r10, r13
- adcs r10, r14, ap
- adcs r14, r15, r19
- adcs r15, r16, r21
- adcs r16, r17, r23
- adcs r17, r11, r18
- adcs r11, r12, r22
- adcs r12, r9, r20
- cinc r9, r8, cs
- str r13, [rp], #1*8
+ adcs r9, r9, r11
+ umulh ap, r5, r8
+
+ adcs r13, r13, r15
+ umulh r11, r6, r8
+
+ str r20, [rp], #1*8
+
+ cinc r15, r24, cs
+ adds r10, r10, r21
+ adcs r12, r12, r22
+ adcs r14, r14, r23
+ C 10, 12, 14, (16, 17), (18, 19), (ap, 9), (11, 13), 15
+
+ sub n, n, #1
cbnz n, L(a8)
- ldp r20, r21, [sp, #2*8]
- ldp r22, r23, [sp, #4*8]
- ldr r24, [sp, #6*8]
- ldp r18, r19, [sp], #8*8
-L(f8): stp r10, r14, [rp]
- stp r15, r16, [rp,#2*8]
- stp r17, r11, [rp,#4*8]
- stp r12, r9, [rp,#6*8]
+ ldp r22, r23, [sp, #2*8]
+ ldr r24, [sp, #4*8]
+ ldp r20, r21, [sp], #6*8
+L(f8): adcs r16, r16, r17
+ adcs r18, r18, r19
+ stp r10, r12, [rp]
+ adcs ap, ap, r9
+ adcs r11, r11, r13
+ stp r14, r16, [rp, #2*8]
+ cinc r15, r15, cs
ldp r16, r17, [sp, #2*8]
- ldp r14, r15, [sp], #4*8
- mov res, r9
+ stp r18, ap, [rp, #4*8]
+ ldp r18, r19, [sp, #4*8]
+ stp r11, r15, [rp, #6*8]
+ mov res, r15
+ ldp r14, r15, [sp], #6*8
+
ret
EPILOGUE()
diff --git a/src/mpn_extras/arm64/mulhigh_basecase.asm b/src/mpn_extras/arm64/mulhigh_basecase.asm
new file mode 100644
index 0000000000..3fbd446b5b
--- /dev/null
+++ b/src/mpn_extras/arm64/mulhigh_basecase.asm
@@ -0,0 +1,419 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl Uses code adopted from the GNU MP Library.
+dnl
+dnl Copyright 2020 Free Software Foundation, Inc.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version. See .
+dnl
+
+include(`config.m4')
+
+define(`rp', `x0')
+define(`ap', `x1')
+define(`bp', `x2')
+define(`n', `x3')
+
+define(`r0', `x4')
+define(`r1', `x5')
+define(`r2', `x6')
+define(`r3', `x7')
+define(`r4', `x8')
+define(`r5', `x9')
+define(`r6', `x10')
+define(`r7', `x11')
+define(`r8', `x12')
+define(`r9', `x13')
+define(`r10', `x14')
+define(`r11', `x15')
+define(`r12', `x16')
+define(`r13', `x17')
+
+dnl NOTE: The following has to be pushed and popped
+dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times!
+define(`r14', `x19')
+define(`r15', `x20')
+define(`r16', `x21')
+define(`r17', `x22')
+define(`r18', `x23')
+define(`r19', `x24')
+define(`r20', `x25')
+define(`r21', `x26')
+define(`r22', `x27')
+define(`r23', `x28')
+define(`r24', `x30')
+
+define(`res', `rp') dnl NOTE: Synonymous with rp!
+
+dnl Idea is to compute the first 9 by 8 block, then continue via addmul chains.
+
+dnl n = 10:
+dnl
+dnl 0 1 2 3 4 5 6 7 8 9
+dnl 0 _ _ _ _ _ _ _ h x
+dnl 1 h¦x x
+dnl 2 h x¦x x
+dnl 3 h x x¦x x
+dnl 4 h x x x¦x x
+dnl 5 h x x x x¦x x
+dnl 6 h x x x x x¦x x
+dnl 7 h x x x x x x¦x x
+dnl 8 h x x x x x x x¦x x
+dnl 9 x x x x x x x x¦x x
+
+PROLOGUE(_flint_mpn_mulhigh_basecase)
+ add ap, ap, n, lsl #3
+ sub ap, ap, #9*8
+ stp r14, r15, [sp, #-10*8]!
+ ldp r9, r10, [bp]
+ ldp r6, r7, [ap, #6*8]
+ ldr r8, [ap, #8*8]
+
+ ldp r11, r12, [bp, #2*8]
+ stp r16, r17, [sp, #2*8]
+ ldp r13, r14, [bp, #4*8]
+ stp r18, r19, [sp, #4*8]
+
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+
+ stp r20, r21, [sp, #6*8]
+ ldp r15, r16, [bp, #6*8]
+ stp r22, r23, [sp, #8*8]
+
+ C ap: 0, 1, ..., 8
+ C bp: 9, ..., 16
+ C Free: 17, ..., 23
+
+ umulh r17, r7, r9
+ mul r18, r8, r9
+ umulh r9, r8, r9
+ umulh r19, r7, r10
+ umulh r20, r8, r10
+ umulh r21, r6, r10
+ mul r22, r7, r10
+ mul r10, r8, r10
+ C (17, 18, 21, 22), (9, 19, 10), 20
+ C Free: 23
+
+ umulh r23, r5, r11
+ adds r17, r17, r18
+ umulh r18, r6, r11
+ adcs r9, r9, r19
+ umulh r19, r7, r11
+ cinc r20, r20, cs
+ adds r17, r17, r21
+ umulh r21, r8, r11
+ adcs r9, r9, r10
+ mul r10, r6, r11
+ cinc r20, r20, cs
+ adds r17, r17, r22
+ mul r22, r7, r11
+ mul r11, r8, r11
+
+ adcs r9, r9, r18
+ umulh r18, r4, r12
+ adcs r20, r20, r19
+ umulh r19, r5, r12
+ cinc r21, r21, cs
+ adds r17, r17, r23
+ umulh r23, r6, r12
+ adcs r9, r9, r22
+ umulh r22, r7, r12
+ adcs r20, r20, r11
+ umulh r11, r8, r12
+ cinc r21, r21, cs
+ adds r17, r17, r10
+ mul r10, r5, r12
+ adcs r9, r9, r19
+ mul r19, r6, r12
+ adcs r20, r20, r23
+ mul r23, r7, r12
+ mul r12, r8, r12
+
+ adcs r21, r21, r22
+ umulh r22, r3, r13
+ cinc r11, r11, cs
+ adds r17, r17, r18
+ umulh r18, r4, r13
+ adcs r9, r9, r19
+ umulh r19, r5, r13
+ adcs r20, r20, r23
+ umulh r23, r6, r13
+ adcs r21, r21, r12
+ umulh r12, r7, r13
+ cinc r11, r11, cs
+ adds r17, r17, r10
+ umulh r10, r8, r13
+ adcs r9, r9, r18
+ mul r18, r4, r13
+ adcs r20, r20, r19
+ mul r19, r5, r13
+ adcs r21, r21, r23
+ mul r23, r6, r13
+ adcs r11, r11, r12
+ mul r12, r7, r13
+ mul r13, r8, r13
+
+ cinc r10, r10, cs
+ adds r17, r17, r22
+ umulh r22, r2, r14
+ adcs r9, r9, r19
+ umulh r19, r3, r14
+ adcs r20, r20, r23
+ umulh r23, r4, r14
+ adcs r21, r21, r12
+ umulh r12, r5, r14
+ adcs r11, r11, r13
+ umulh r13, r6, r14
+ cinc r10, r10, cs
+ adds r17, r17, r18
+ umulh r18, r7, r14
+ adcs r9, r9, r19
+ umulh r19, r8, r14
+ adcs r20, r20, r23
+ mul r23, r3, r14
+ adcs r21, r21, r12
+ mul r12, r4, r14
+ adcs r11, r11, r13
+ mul r13, r5, r14
+ adcs r10, r10, r18
+ mul r18, r6, r14
+ cinc r19, r19, cs
+ adds r17, r17, r22
+ mul r22, r7, r14
+ mul r14, r8, r14
+
+ adcs r9, r9, r12
+ umulh r12, r1, r15
+ adcs r20, r20, r13
+ umulh r13, r2, r15
+ adcs r21, r21, r18
+ umulh r18, r3, r15
+ adcs r11, r11, r22
+ umulh r22, r4, r15
+ adcs r10, r10, r14
+ umulh r14, r5, r15
+ cinc r19, r19, cs
+ adds r17, r17, r23
+ umulh r23, r6, r15
+ adcs r9, r9, r13
+ umulh r13, r7, r15
+ adcs r20, r20, r18
+ umulh r18, r8, r15
+ adcs r21, r21, r22
+ mul r22, r2, r15
+ adcs r11, r11, r14
+ mul r14, r3, r15
+ adcs r10, r10, r23
+ mul r23, r4, r15
+ adcs r19, r19, r13
+ mul r13, r5, r15
+ cinc r18, r18, cs
+ adds r17, r17, r12
+ mul r12, r6, r15
+ adcs r9, r9, r14
+ mul r14, r7, r15
+ mul r15, r8, r15
+
+ adcs r20, r20, r23
+ umulh r23, r1, r16
+ adcs r21, r21, r13
+ umulh r13, r2, r16
+ adcs r11, r11, r12
+ umulh r12, r3, r16
+ adcs r10, r10, r14
+ umulh r14, r4, r16
+ adcs r19, r19, r15
+ umulh r15, r5, r16
+ cinc r18, r18, cs
+ adds r17, r17, r22
+ umulh r22, r6, r16
+ adcs r9, r9, r23
+ umulh r23, r7, r16
+ adcs r20, r20, r13
+ umulh r13, r8, r16
+ mul r1, r1, r16
+ adcs r21, r21, r12
+ mul r2, r2, r16
+ adcs r11, r11, r14
+ mul r3, r3, r16
+ adcs r10, r10, r15
+ mul r4, r4, r16
+ adcs r19, r19, r22
+ mul r5, r5, r16
+ adcs r18, r18, r23
+ mul r6, r6, r16
+ cinc r13, r13, cs
+ mul r7, r7, r16
+ adds r1, r17, r1 C Store lowest limb (return value) in r1
+ mul r8, r8, r16
+ adcs r9, r9, r2
+ umulh r2, r0, r16 C Save r2 for next addmul
+ adcs r20, r20, r3
+ stp r9, r20, [rp]
+ adcs r21, r21, r4
+ adcs r11, r11, r5
+ stp r21, r11, [rp, #2*8]
+ adcs r10, r10, r6
+ adcs r19, r19, r7
+ stp r10, r19, [rp, #4*8]
+ adcs r18, r18, r8
+ cinc r13, r13, cs
+ stp r18, r13, [rp, #6*8]
+
+define(`b0', `r0') C loaded bp value
+define(`rx', `r1') C return value
+define(`cy', `r2') C carry for next iteration in loop
+
+define(`ix', `r3') C iterator
+define(`ixsave',`r4') C saved iterator
+define(`ns', `r5') C resetter for ap and rp
+
+define(`jmp', `r6') C jump values
+dnl 7 and 20, ..., 23 will remain unused for the rest of the program
+
+L(hi): LBL_HI( jmp, L(p1h))
+ mov ixsave, #2
+ mov ix, #2
+ ldp r20, r21, [sp, #6*8] C No longer in use
+ ldp r22, r23, [sp, #8*8]
+L(lo): LBL_LO( jmp, L(p1h))
+ LOH_ADRPADD(L(hi), L(lo))
+ ldr b0, [bp, #8*8]!
+ mov ns, #8*8
+ sub n, n, #8
+
+L(p1h): ldr r8, [ap], #1*8
+ mul r12, r8, b0
+ umulh r8, r8, b0
+ add jmp, jmp, #9*4 C jmp goes from p1h to p2h
+ adds rx, rx, cy
+ cinc r8, r8, cs
+ adds rx, rx, r12
+ cinc cy, r8, cs
+ b L(top)
+
+L(p2h): ldp r8, r9, [ap], #2*8
+ ldr r17, [rp]
+ mul r12, r8, b0
+ umulh r8, r8, b0
+ mul r13, r9, b0
+ umulh r9, r9, b0
+ add jmp, jmp, #15*4 C jmp goes from p2h to p3h
+ adds rx, rx, cy
+ adcs r8, r17, r8
+ cinc r9, r9, cs
+ adds rx, rx, r12
+ adcs r8, r8, r13
+ cinc cy, r9, cs
+ str r8, [rp], #1*8
+ b L(top)
+
+L(p3h): ldp r8, r9, [ap], #2*8
+ ldr r10, [ap], #1*8
+ ldp r17, r18, [rp]
+ mul r12, r8, b0
+ umulh r8, r8, b0
+ mul r13, r9, b0
+ umulh r9, r9, b0
+ mul r14, r10, b0
+ umulh r10, r10, b0
+ add jmp, jmp, #20*4 C jmp goes from p3h to p0h
+ adds rx, rx, cy
+ adcs r8, r17, r8
+ adcs r9, r18, r9
+ cinc r10, r10, cs
+ adds rx, rx, r12
+ adcs r8, r8, r13
+ adcs r9, r9, r14
+ cinc cy, r10, cs
+ stp r8, r9, [rp], #2*8
+ b L(top)
+
+L(p0h): ldp r8, r9, [ap], #2*8
+ ldp r10, r11, [ap], #2*8
+ ldp r17, r18, [rp]
+ ldr r19, [rp, #2*8]
+ add ixsave, ixsave, #1
+ sub jmp, jmp, #44*4 C jmp goes from p0h to p1h
+ mul r12, r8, b0
+ umulh r8, r8, b0
+ mul r13, r9, b0
+ umulh r9, r9, b0
+ mul r14, r10, b0
+ umulh r10, r10, b0
+ mul r15, r11, b0
+ umulh r11, r11, b0
+ adds rx, rx, cy
+ adcs r8, r17, r8
+ adcs r9, r18, r9
+ adcs r10, r19, r10
+ cinc r11, r11, cs
+ adds rx, rx, r12
+ adcs r8, r8, r13
+ adcs r9, r9, r14
+ adcs r10, r10, r15
+ cinc cy, r11, cs
+ stp r8, r9, [rp], #2*8
+ str r10, [rp], #1*8
+
+L(top): ldp r8, r9, [ap], #2*8
+ ldp r10, r11, [ap], #2*8
+ ldp r16, r17, [rp]
+ ldp r18, r19, [rp, #2*8]
+ mul r12, r8, b0
+ umulh r8, r8, b0
+ mul r13, r9, b0
+ umulh r9, r9, b0
+ mul r14, r10, b0
+ umulh r10, r10, b0
+ mul r15, r11, b0
+ umulh r11, r11, b0
+ adds r12, r16, r12
+ adcs r8, r17, r8
+ adcs r9, r18, r9
+ adcs r10, r19, r10
+ cinc r11, r11, cs
+ adds r12, r12, cy
+ adcs r8, r8, r13
+ adcs r9, r9, r14
+ adcs r10, r10, r15
+ cinc cy, r11, cs
+ stp r12, r8, [rp], #2*8
+ stp r9, r10, [rp], #2*8
+ sub ix, ix, #1
+ cbnz ix, L(top)
+
+L(end): str cy, [rp]
+
+ sub n, n, #1
+ cbz n, L(fin)
+
+ sub rp, rp, ns C Reset rp
+ sub ap, ap, ns C Reset ap, part 1
+ add ns, ns, #1*8 C Increase reseter
+
+ ldr cy, [ap, #-2*8]! C Reset ap, part 2
+ umulh cy, cy, b0
+
+ mov ix, ixsave
+ ldr b0, [bp, #1*8]!
+ br jmp
+
+L(fin): ldp r16, r17, [sp, #2*8]
+ ldp r18, r19, [sp, #4*8]
+ ldp r14, r15, [sp], #10*8
+
+ mov res, rx
+
+ ret
+EPILOGUE()
diff --git a/src/mpn_extras/arm64/mulhigh_hard.asm b/src/mpn_extras/arm64/mulhigh_hard.asm
new file mode 100644
index 0000000000..0f5ac250c4
--- /dev/null
+++ b/src/mpn_extras/arm64/mulhigh_hard.asm
@@ -0,0 +1,793 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version. See .
+dnl
+
+include(`config.m4')
+
+define(`rp', `x0')
+define(`ap', `x1')
+define(`bp', `x2')
+
+define(`r0', `x3')
+define(`r1', `x4')
+define(`r2', `x5')
+define(`r3', `x6')
+define(`r4', `x7')
+define(`r5', `x8')
+define(`r6', `x9')
+define(`r7', `x10')
+define(`r8', `x11')
+define(`r9', `x12')
+define(`r10', `x13')
+define(`r11', `x14')
+define(`r12', `x15')
+define(`r13', `x16')
+define(`r14', `x17')
+
+define(`r15', `ap') dnl Beware!
+define(`r16', `bp')
+
+dnl NOTE: The following has to be pushed and popped
+dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times!
+define(`r17', `x19')
+define(`r18', `x20')
+define(`r19', `x21')
+define(`r20', `x22')
+define(`r21', `x23')
+define(`r22', `x24')
+define(`r23', `x25')
+define(`r24', `x26')
+define(`r25', `x27')
+define(`r26', `x28')
+define(`r27', `x30')
+
+define(`res', `rp') dnl NOTE: Synonymous with rp!
+
+PROLOGUE(flint_mpn_mulhigh_1)
+ ldr r0, [ap]
+ ldr r1, [bp]
+
+ umulh r3, r0, r1
+ mul r2, r0, r1
+
+ str r3, [rp]
+ mov res, r2
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_mulhigh_2)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [bp]
+
+ umulh r4, r0, r2
+
+ mul r5, r0, r3
+ umulh r6, r0, r3
+
+ mul r7, r1, r2
+ umulh r8, r1, r2
+
+ mul r9, r1, r3
+ umulh r10, r1, r3
+ C (4, 5, 7), (6, 8, 9), 10
+
+ adds r4, r4, r5
+ adcs r6, r6, r8
+ cinc r10, r10, cs
+ adds r4, r4, r7
+ adcs r6, r6, r9
+ cinc r10, r10, cs
+
+ stp r6, r10, [rp]
+ mov res, r4
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_mulhigh_3)
+ ldp r0, r1, [ap]
+ ldp r3, r4, [bp]
+ ldr r2, [ap, #2*8]
+ ldr r5, [bp, #2*8]
+
+ umulh r6, r1, r3
+ mul r7, r2, r3
+ umulh r3, r2, r3
+ umulh r8, r1, r4
+ umulh r9, r2, r4
+ umulh r10, r0, r4
+ mul r11, r2, r4
+ mul r12, r0, r5
+
+ adds r6, r6, r7
+
+ umulh r0, r0, r5
+ umulh r13, r1, r5
+
+ adcs r3, r3, r8
+
+ umulh r14, r2, r5
+ mul r4, r1, r4
+
+ cinc r9, r9, cs
+ adds r6, r6, r10
+
+ mul r1, r1, r5
+ mul r2, r2, r5
+
+ adcs r3, r3, r11
+ cinc r9, r9, cs
+
+ adds r6, r6, r12
+ adcs r3, r3, r0
+ adcs r9, r9, r13
+ cinc r14, r14, cs
+
+ adds r6, r6, r4
+ adcs r3, r3, r1
+ adcs r9, r9, r2
+ cinc r14, r14, cs
+
+ stp r3, r9, [rp]
+ str r14, [rp, #2*8]
+ mov res, r6
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_mulhigh_4)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [bp]
+ ldp r6, r7, [bp, #2*8]
+
+ umulh r8, r2, r4
+ mul r9, r3, r4
+ umulh r4, r3, r4
+ umulh r12, r2, r5
+ umulh r14, r3, r5
+ umulh r10, r1, r5
+ mul r11, r2, r5
+ mul r13, r3, r5
+ C (8, 9, 10, 11), (4, 12, 13), 14
+ C Free: 5, 15, 16
+
+ adds r8, r8, r9
+
+ umulh r9, r0, r6
+ umulh r5, r1, r6
+
+ adcs r4, r4, r12
+
+ umulh r12, r2, r6
+ umulh r15, r3, r6
+
+ cinc r14, r14, cs
+ adds r8, r8, r10
+
+ mul r16, r1, r6
+ mul r10, r2, r6
+
+ adcs r4, r4, r13
+ cinc r14, r14, cs
+ adds r8, r8, r11
+
+ mul r13, r3, r6
+ umulh r11, r0, r7
+
+ adcs r4, r4, r5
+ adcs r14, r14, r12
+
+ umulh r5, r1, r7
+ umulh r12, r2, r7
+
+ cinc r15, r15, cs
+ adds r8, r8, r9
+ adcs r4, r4, r10
+
+ umulh r9, r3, r7
+ mul r10, r0, r7
+
+ adcs r14, r14, r13
+ cinc r15, r15, cs
+ adds r8, r8, r16
+
+ mul r13, r1, r7
+ mul r16, r2, r7
+
+ adcs r4, r4, r11
+ adcs r14, r14, r5
+
+ mul r11, r3, r7
+
+ adcs r15, r15, r12
+ cinc r9, r9, cs
+ C (8, 10), (4, 13), (14, 16), (15, 11), 9
+
+ adds r8, r8, r10
+ adcs r4, r4, r13
+ adcs r14, r14, r16
+ adcs r15, r15, r11
+ cinc r9, r9, cs
+
+ stp r4, r14, [rp]
+ stp r15, r9, [rp, #2*8]
+ mov res, r8
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_mulhigh_5)
+ ldp r2, r3, [ap, #2*8]
+ ldr r4, [ap, #4*8]
+ ldp r5, r6, [bp]
+ ldp r0, r1, [ap]
+ ldp r7, r8, [bp, #2*8]
+ ldr r9, [bp, #4*8]
+
+ umulh r10, r3, r5
+ mul r11, r4, r5
+ umulh r5, r4, r5
+ umulh r12, r3, r6
+ umulh r13, r4, r6
+ umulh r14, r2, r6
+ mul r15, r3, r6
+ mul r6, r4, r6
+ C (10, 11, 14, 15), (5, 12, 6), 13
+ C Free: 16
+
+ adds r10, r10, r11
+ umulh r16, r1, r7
+ umulh r11, r2, r7
+ adcs r5, r5, r12
+ umulh r12, r3, r7
+ cinc r13, r13, cs
+ adds r10, r10, r14
+ umulh r14, r4, r7
+ adcs r5, r5, r6
+ mul r6, r2, r7
+ cinc r13, r13, cs
+ adds r10, r10, r15
+ mul r15, r3, r7
+ adcs r5, r5, r11
+ mul r7, r4, r7
+ umulh r11, r0, r8
+ adcs r13, r13, r12
+ umulh r12, r1, r8
+ cinc r14, r14, cs
+ adds r10, r10, r16
+ umulh r16, r2, r8
+ adcs r5, r5, r15
+ umulh r15, r3, r8
+ adcs r13, r13, r7
+ umulh r7, r4, r8
+ cinc r14, r14, cs
+ adds r10, r10, r6
+ mul r6, r1, r8
+ adcs r5, r5, r12
+ mul r12, r2, r8
+ adcs r13, r13, r16
+ mul r16, r3, r8
+ adcs r14, r14, r15
+ mul r8, r4, r8
+ umulh r15, r0, r9
+ cinc r7, r7, cs
+ adds r10, r10, r11
+ umulh r11, r1, r9
+ adcs r5, r5, r12
+ umulh r12, r2, r9
+ adcs r13, r13, r16
+ umulh r16, r3, r9
+ adcs r14, r14, r8
+ umulh r8, r4, r9
+ mul r0, r0, r9
+ cinc r7, r7, cs
+ mul r1, r1, r9
+ mul r2, r2, r9
+ mul r3, r3, r9
+ mul r4, r4, r9
+
+ adds r10, r10, r6
+ adcs r5, r5, r15
+ adcs r13, r13, r11
+ adcs r14, r14, r12
+ adcs r7, r7, r16
+ cinc r8, r8, cs
+ adds r10, r10, r0
+ adcs r5, r5, r1
+ adcs r13, r13, r2
+ adcs r14, r14, r3
+ stp r5, r13, [rp]
+ adcs r7, r7, r4
+ cinc r8, r8, cs
+ stp r14, r7, [rp, #2*8]
+ str r8, [rp, #4*8]
+ mov res, r10
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_mulhigh_6)
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+ ldp r6, r7, [bp]
+
+ ldp r0, r1, [ap]
+ ldp r8, r9, [bp, #2*8]
+ ldp r10, r11, [bp, #4*8]
+
+ stp r17, r18, [sp, #-2*8]!
+
+ umulh r12, r4, r6
+ mul r13, r5, r6
+ umulh r6, r5, r6
+ umulh r14, r4, r7
+ umulh r15, r5, r7
+ umulh r16, r3, r7
+ mul r17, r4, r7
+ mul r7, r5, r7
+ C (12, 13, 16, 17), (6, 14, 7), 15
+ C Free: 18
+
+ umulh r18, r2, r8
+ adds r12, r12, r13
+ umulh r13, r3, r8
+ adcs r6, r6, r14
+ umulh r14, r4, r8
+ cinc r15, r15, cs
+ adds r12, r12, r16
+ umulh r16, r5, r8
+ adcs r6, r6, r7
+ mul r7, r3, r8
+ cinc r15, r15, cs
+ adds r12, r12, r17
+ mul r17, r4, r8
+ mul r8, r5, r8
+ adcs r6, r6, r13
+ umulh r13, r1, r9
+ adcs r15, r15, r14
+ umulh r14, r2, r9
+ cinc r16, r16, cs
+ adds r12, r12, r18
+ umulh r18, r3, r9
+ adcs r6, r6, r17
+ umulh r17, r4, r9
+ adcs r15, r15, r8
+ umulh r8, r5, r9
+ cinc r16, r16, cs
+ adds r12, r12, r7
+ mul r7, r2, r9
+ adcs r6, r6, r14
+ mul r14, r3, r9
+ adcs r15, r15, r18
+ mul r18, r4, r9
+ mul r9, r5, r9
+ adcs r16, r16, r17
+ umulh r17, r0, r10
+ cinc r8, r8, cs
+ adds r12, r12, r13
+ umulh r13, r1, r10
+ adcs r6, r6, r14
+ umulh r14, r2, r10
+ adcs r15, r15, r18
+ umulh r18, r3, r10
+ adcs r16, r16, r9
+ umulh r9, r4, r10
+ cinc r8, r8, cs
+ adds r12, r12, r7
+ umulh r7, r5, r10
+ adcs r6, r6, r13
+ mul r13, r1, r10
+ adcs r15, r15, r14
+ mul r14, r2, r10
+ adcs r16, r16, r18
+ mul r18, r3, r10
+ adcs r8, r8, r9
+ mul r9, r4, r10
+ mul r10, r5, r10
+ cinc r7, r7, cs
+ adds r12, r12, r17
+ umulh r17, r0, r11
+ adcs r6, r6, r14
+ umulh r14, r1, r11
+ adcs r15, r15, r18
+ umulh r18, r2, r11
+ adcs r16, r16, r9
+ umulh r9, r3, r11
+ adcs r8, r8, r10
+ umulh r10, r4, r11
+ cinc r7, r7, cs
+ adds r12, r12, r13
+ umulh r13, r5, r11
+ mul r0, r0, r11
+ adcs r6, r6, r17
+ mul r1, r1, r11
+ mul r2, r2, r11
+ adcs r15, r15, r14
+ mul r3, r3, r11
+ mul r4, r4, r11
+ adcs r16, r16, r18
+ mul r5, r5, r11
+ adcs r8, r8, r9
+ adcs r7, r7, r10
+ cinc r13, r13, cs
+ ldp r17, r18, [sp], #2*8
+ adds r12, r12, r0
+ adcs r6, r6, r1
+ adcs r15, r15, r2
+ adcs r16, r16, r3
+ stp r6, r15, [rp]
+ adcs r8, r8, r4
+ adcs r7, r7, r5
+ stp r16, r8, [rp, #2*8]
+ cinc r13, r13, cs
+ stp r7, r13, [rp, #4*8]
+
+ mov res, r12
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_mulhigh_7)
+ ldp r4, r5, [ap, #4*8]
+ ldr r6, [ap, #6*8]
+ ldp r7, r8, [bp]
+ stp r17, r18, [sp, #-4*8]!
+
+ ldp r9, r10, [bp, #2*8]
+ ldp r11, r12, [bp, #4*8]
+ ldr r13, [bp, #6*8]
+ stp r19, r20, [sp, #2*8]
+
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+
+ umulh r14, r5, r7
+ mul r15, r6, r7
+ umulh r7, r6, r7
+ umulh r16, r5, r8
+ umulh r17, r6, r8
+ umulh r18, r4, r8
+ mul r19, r5, r8
+ mul r8, r6, r8
+ C (14, 15, 18, 19), (7, 16, 8), 17
+ C Free: 20
+
+ umulh r20, r3, r9
+ adds r14, r14, r15
+ umulh r15, r4, r9
+ adcs r7, r7, r16
+ umulh r16, r5, r9
+ cinc r17, r17, cs
+ adds r14, r14, r18
+ umulh r18, r6, r9
+ adcs r7, r7, r8
+ mul r8, r4, r9
+ cinc r17, r17, cs
+ adds r14, r14, r19
+ mul r19, r5, r9
+ adcs r7, r7, r15
+ mul r9, r6, r9
+
+ umulh r15, r2, r10
+ adcs r17, r17, r16
+ umulh r16, r3, r10
+ cinc r18, r18, cs
+ adds r14, r14, r20
+ umulh r20, r4, r10
+ adcs r7, r7, r19
+ umulh r19, r5, r10
+ adcs r17, r17, r9
+ umulh r9, r6, r10
+ cinc r18, r18, cs
+ adds r14, r14, r8
+ mul r8, r3, r10
+ adcs r7, r7, r16
+ mul r16, r4, r10
+ adcs r17, r17, r20
+ mul r20, r5, r10
+ adcs r18, r18, r19
+ mul r10, r6, r10
+
+ umulh r19, r1, r11
+ cinc r9, r9, cs
+ adds r14, r14, r15
+ umulh r15, r2, r11
+ adcs r7, r7, r16
+ umulh r16, r3, r11
+ adcs r17, r17, r20
+ umulh r20, r4, r11
+ adcs r18, r18, r10
+ umulh r10, r5, r11
+ cinc r9, r9, cs
+ adds r14, r14, r8
+ umulh r8, r6, r11
+ adcs r7, r7, r15
+ mul r15, r2, r11
+ adcs r17, r17, r16
+ mul r16, r3, r11
+ adcs r18, r18, r20
+ mul r20, r4, r11
+ adcs r9, r9, r10
+ mul r10, r5, r11
+ mul r11, r6, r11
+
+ cinc r8, r8, cs
+ adds r14, r14, r19
+ umulh r19, r0, r12
+ adcs r7, r7, r16
+ umulh r16, r1, r12
+ adcs r17, r17, r20
+ umulh r20, r2, r12
+ adcs r18, r18, r10
+ umulh r10, r3, r12
+ adcs r9, r9, r11
+ umulh r11, r4, r12
+ cinc r8, r8, cs
+ adds r14, r14, r15
+ umulh r15, r5, r12
+ adcs r7, r7, r16
+ umulh r16, r6, r12
+ adcs r17, r17, r20
+ mul r20, r1, r12
+ adcs r18, r18, r10
+ mul r10, r2, r12
+ adcs r9, r9, r11
+ mul r11, r3, r12
+ adcs r8, r8, r15
+ mul r15, r4, r12
+ cinc r16, r16, cs
+ adds r14, r14, r19
+ mul r19, r5, r12
+ mul r12, r6, r12
+
+ adcs r7, r7, r10
+ umulh r10, r0, r13
+ adcs r17, r17, r11
+ umulh r11, r1, r13
+ adcs r18, r18, r15
+ umulh r15, r2, r13
+ adcs r9, r9, r19
+ umulh r19, r3, r13
+ adcs r8, r8, r12
+ umulh r12, r4, r13
+ cinc r16, r16, cs
+ adds r14, r14, r20
+ umulh r20, r5, r13
+ adcs r7, r7, r10
+ umulh r10, r6, r13
+ mul r0, r0, r13
+ adcs r17, r17, r11
+ mul r1, r1, r13
+ mul r2, r2, r13
+ adcs r18, r18, r15
+ mul r3, r3, r13
+ mul r4, r4, r13
+ adcs r9, r9, r19
+ mul r5, r5, r13
+ mul r6, r6, r13
+ adcs r8, r8, r12
+ adcs r16, r16, r20
+ ldp r19, r20, [sp, #2*8]
+ cinc r10, r10, cs
+ adds r14, r14, r0
+ adcs r7, r7, r1
+ adcs r11, r17, r2
+ adcs r15, r18, r3
+ adcs r9, r9, r4
+ stp r7, r11, [rp]
+ ldp r17, r18, [sp], #4*8
+ adcs r8, r8, r5
+ stp r15, r9, [rp, #2*8]
+ adcs r16, r16, r6
+ cinc r10, r10, cs
+ stp r8, r16, [rp, #4*8]
+ str r10, [rp, #6*8]
+ mov res, r14
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_mulhigh_8)
+ ldp r4, r5, [ap, #4*8]
+ ldp r6, r7, [ap, #6*8]
+ ldp r8, r9, [bp]
+ stp r17, r18, [sp, #-6*8]!
+
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r10, r11, [bp, #2*8]
+ stp r19, r20, [sp, #2*8]
+
+ ldp r12, r13, [bp, #4*8]
+ ldp r14, r15, [bp, #6*8]
+ stp r21, r22, [sp, #4*8]
+
+ umulh r16, r6, r8
+ mul r17, r7, r8
+ umulh r8, r7, r8
+ umulh r18, r6, r9
+ umulh r19, r7, r9
+ umulh r20, r5, r9
+ mul r21, r6, r9
+ mul r9, r7, r9
+ C (16, 17, 20, 21), (8, 18, 9), 19
+ C Free: 22
+
+ umulh r22, r4, r10
+ adds r16, r16, r17
+ umulh r17, r5, r10
+ adcs r8, r8, r18
+ umulh r18, r6, r10
+ cinc r19, r19, cs
+ adds r16, r16, r20
+ umulh r20, r7, r10
+ adcs r8, r8, r9
+ mul r9, r5, r10
+ cinc r19, r19, cs
+ adds r16, r16, r21
+ mul r21, r6, r10
+ mul r10, r7, r10
+
+ adcs r8, r8, r17
+ umulh r17, r3, r11
+ adcs r19, r19, r18
+ umulh r18, r4, r11
+ cinc r20, r20, cs
+ adds r16, r16, r22
+ umulh r22, r5, r11
+ adcs r8, r8, r21
+ umulh r21, r6, r11
+ adcs r19, r19, r10
+ umulh r10, r7, r11
+ cinc r20, r20, cs
+ adds r16, r16, r9
+ mul r9, r4, r11
+ adcs r8, r8, r18
+ mul r18, r5, r11
+ adcs r19, r19, r22
+ mul r22, r6, r11
+ mul r11, r7, r11
+
+ adcs r20, r20, r21
+ umulh r21, r2, r12
+ cinc r10, r10, cs
+ adds r16, r16, r17
+ umulh r17, r3, r12
+ adcs r8, r8, r18
+ umulh r18, r4, r12
+ adcs r19, r19, r22
+ umulh r22, r5, r12
+ adcs r20, r20, r11
+ umulh r11, r6, r12
+ cinc r10, r10, cs
+ adds r16, r16, r9
+ umulh r9, r7, r12
+ adcs r8, r8, r17
+ mul r17, r3, r12
+ adcs r19, r19, r18
+ mul r18, r4, r12
+ adcs r20, r20, r22
+ mul r22, r5, r12
+ adcs r10, r10, r11
+ mul r11, r6, r12
+ mul r12, r7, r12
+
+ cinc r9, r9, cs
+ adds r16, r16, r21
+ umulh r21, r1, r13
+ adcs r8, r8, r18
+ umulh r18, r2, r13
+ adcs r19, r19, r22
+ umulh r22, r3, r13
+ adcs r20, r20, r11
+ umulh r11, r4, r13
+ adcs r10, r10, r12
+ umulh r12, r5, r13
+ cinc r9, r9, cs
+ adds r16, r16, r17
+ umulh r17, r6, r13
+ adcs r8, r8, r18
+ umulh r18, r7, r13
+ adcs r19, r19, r22
+ mul r22, r2, r13
+ adcs r20, r20, r11
+ mul r11, r3, r13
+ adcs r10, r10, r12
+ mul r12, r4, r13
+ adcs r9, r9, r17
+ mul r17, r5, r13
+ cinc r18, r18, cs
+ adds r16, r16, r21
+ mul r21, r6, r13
+ mul r13, r7, r13
+
+ adcs r8, r8, r11
+ umulh r11, r0, r14
+ adcs r19, r19, r12
+ umulh r12, r1, r14
+ adcs r20, r20, r17
+ umulh r17, r2, r14
+ adcs r10, r10, r21
+ umulh r21, r3, r14
+ adcs r9, r9, r13
+ umulh r13, r4, r14
+ cinc r18, r18, cs
+ adds r16, r16, r22
+ umulh r22, r5, r14
+ adcs r8, r8, r12
+ umulh r12, r6, r14
+ adcs r19, r19, r17
+ umulh r17, r7, r14
+ adcs r20, r20, r21
+ mul r21, r1, r14
+ adcs r10, r10, r13
+ mul r13, r2, r14
+ adcs r9, r9, r22
+ mul r22, r3, r14
+ adcs r18, r18, r12
+ mul r12, r4, r14
+ cinc r17, r17, cs
+ adds r16, r16, r11
+ mul r11, r5, r14
+ adcs r8, r8, r13
+ mul r13, r6, r14
+ mul r14, r7, r14
+
+ adcs r19, r19, r22
+ umulh r22, r0, r15
+ adcs r20, r20, r12
+ umulh r12, r1, r15
+ adcs r10, r10, r11
+ umulh r11, r2, r15
+ adcs r9, r9, r13
+ umulh r13, r3, r15
+ adcs r18, r18, r14
+ umulh r14, r4, r15
+ cinc r17, r17, cs
+ adds r16, r16, r21
+ umulh r21, r5, r15
+ adcs r8, r8, r22
+ umulh r22, r6, r15
+ adcs r19, r19, r12
+ umulh r12, r7, r15
+ mul r0, r0, r15
+ adcs r20, r20, r11
+ mul r1, r1, r15
+ adcs r10, r10, r13
+ mul r2, r2, r15
+ adcs r9, r9, r14
+ mul r3, r3, r15
+ adcs r18, r18, r21
+ mul r4, r4, r15
+ adcs r17, r17, r22
+ mul r5, r5, r15
+ cinc r12, r12, cs
+ ldp r21, r22, [sp, #4*8]
+ mul r6, r6, r15
+ adds r16, r16, r0
+ mul r7, r7, r15
+ adcs r8, r8, r1
+ adcs r0, r19, r2
+ adcs r1, r20, r3
+ stp r8, r0, [rp]
+ ldp r19, r20, [sp, #2*8]
+ adcs r10, r10, r4
+ adcs r9, r9, r5
+ stp r1, r10, [rp, #2*8]
+ adcs r6, r18, r6
+ adcs r7, r17, r7
+ ldp r17, r18, [sp], #6*8
+ stp r9, r6, [rp, #4*8]
+ cinc r12, r12, cs
+ stp r7, r12, [rp, #6*8]
+ mov res, r16
+ ret
+EPILOGUE()
diff --git a/src/mpn_extras/arm64/sqr_hard.asm b/src/mpn_extras/arm64/sqr_hard.asm
new file mode 100644
index 0000000000..4fe56fbdc0
--- /dev/null
+++ b/src/mpn_extras/arm64/sqr_hard.asm
@@ -0,0 +1,1376 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version. See .
+dnl
+
+include(`config.m4')
+
+define(`rp', `x0')
+define(`ap', `x1')
+
+define(`r0', `x2')
+define(`r1', `x3')
+define(`r2', `x4')
+define(`r3', `x5')
+define(`r4', `x6')
+define(`r5', `x7')
+define(`r6', `x8')
+define(`r7', `x9')
+define(`r8', `x10')
+define(`r9', `x11')
+define(`r10', `x12')
+define(`r11', `x13')
+define(`r12', `x14')
+define(`r13', `x15')
+define(`r14', `x16')
+define(`r15', `x17')
+
+define(`zr', `xzr')
+
+define(`res', `rp') dnl NOTE: Synonymous with rp!
+
+define(`r16', `ap') dnl Beware!
+
+dnl NOTE: The following has to be pushed and popped
+dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times!
+define(`r17', `x19')
+define(`r18', `x20')
+define(`r19', `x21')
+define(`r20', `x22')
+define(`r21', `x23')
+define(`r22', `x24')
+define(`r23', `x25')
+define(`r24', `x26')
+define(`r25', `x27')
+define(`r26', `x28')
+define(`r27', `x30')
+
+PROLOGUE(flint_mpn_sqr_1)
+ ldr r0, [ap]
+
+ mul r1, r0, r0
+ umulh r2, r0, r0
+
+ stp r1, r2, [rp]
+ mov res, r2
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqr_2)
+ ldp r0, r1, [ap]
+
+ mul r2, r0, r1
+ umulh r3, r0, r1
+ C 2, 3
+
+ mul r4, r0, r0
+ umulh r0, r0, r0
+ mul r5, r1, r1
+ umulh r1, r1, r1
+ C 4, 0, 5, 1
+
+ adds r2, r2, r2
+ adcs r3, r3, r3
+ cset r6, cs
+ C 2, 3, 6
+
+ adds r0, r0, r2
+ adcs r5, r5, r3
+ adc r1, r1, r6
+ C 4, 0, 5, 1
+
+ stp r4, r0, [rp]
+ stp r5, r1, [rp, #2*8]
+ mov res, r1
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqr_3)
+ ldp r0, r1, [ap]
+ ldr r2, [ap, #2*8]
+
+ mul r3, r0, r1
+ umulh r4, r0, r1
+ mul r5, r0, r2
+ umulh r6, r0, r2
+ mul r7, r1, r2
+ umulh r8, r1, r2
+ C 3, (4, 5), (6, 7), 8
+
+ mul r9, r0, r0
+ umulh r0, r0, r0
+ mul r10, r1, r1
+ umulh r1, r1, r1
+
+ adds r4, r4, r5
+ adcs r6, r6, r7
+ cinc r8, r8, cs
+ C 3, 4, 6, 8
+
+ mul r11, r2, r2
+ umulh r2, r2, r2
+ C 9, 0, 10, 1, 11, 2
+
+ adds r3, r3, r3
+ adcs r4, r4, r4
+ adcs r6, r6, r6
+ adcs r8, r8, r8
+ cset r12, cs
+ C 3, 4, 6, 8, 12
+
+ C 3, 4, 6, 8, 12
+ C 9, 0, 10, 1, 11, 2
+ adds r0, r0, r3
+
+ adcs r10, r10, r4
+ adcs r1, r1, r6
+ adcs r11, r11, r8
+
+ stp r9, r0, [rp]
+
+ adc r2, r2, r12
+
+ stp r10, r1, [rp, #2*8]
+ stp r11, r2, [rp, #4*8]
+ mov res, r2
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqr_4)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+
+ mul r4, r0, r1
+ umulh r5, r0, r1
+ mul r6, r0, r2
+ umulh r7, r0, r2
+ mul r8, r0, r3
+ umulh r9, r0, r3
+ umulh r10, r1, r2
+ umulh r11, r1, r3
+ C 4, (5, 6), (7, 8), (9, 10), 11
+
+ mul r12, r1, r2
+ mul r13, r1, r3
+
+ adds r5, r5, r6
+
+ mul r14, r2, r3
+ umulh r15, r2, r3
+
+ adcs r7, r7, r8
+
+ mul r6, r0, r0
+ umulh r0, r0, r0
+
+ adcs r9, r9, r10
+
+ mul r8, r1, r1
+ umulh r1, r1, r1
+
+ cinc r11, r11, cs
+
+ adds r7, r7, r12
+ adcs r9, r9, r13
+
+ mul r10, r2, r2
+ umulh r2, r2, r2
+
+ adcs r11, r11, r14
+ cinc r15, r15, cs
+ C 4, 5, 7, 9, 11, 15
+
+ adds r4, r4, r4
+
+ mul r12, r3, r3
+ umulh r3, r3, r3
+ C 6, 0, 8, 1, 10, 2, 12, 3
+
+ adcs r5, r5, r5
+ adcs r7, r7, r7
+ adcs r9, r9, r9
+ adcs r11, r11, r11
+ adcs r15, r15, r15
+ cset r13, cs
+ C 4, 5, 7, 9, 11, 15, 13
+
+ C 4, 5, 7, 9, 11, 15, 13
+ C 6, 0, 8, 1, 10, 2, 12, 3
+ adds r0, r0, r4
+ adcs r8, r8, r5
+ adcs r1, r1, r7
+ stp r6, r0, [rp]
+ adcs r10, r10, r9
+ adcs r2, r2, r11
+ adcs r12, r12, r15
+ stp r8, r1, [rp, #2*8]
+ adc r3, r3, r13
+ stp r10, r2, [rp, #4*8]
+ stp r12, r3, [rp, #6*8]
+ mov res, r3
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqr_5)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldr r4, [ap, #4*8]
+
+ stp r17, r18, [sp, #-2*8]!
+
+ mul r5, r0, r1
+ umulh r6, r0, r1
+ mul r7, r0, r2
+ umulh r8, r0, r2
+ mul r9, r0, r3
+ umulh r10, r0, r3
+ mul r11, r0, r4
+ umulh r12, r0, r4
+ umulh r13, r1, r3
+ umulh r14, r1, r4
+
+ adds r6, r6, r7
+
+ mul r15, r1, r2
+ umulh r16, r1, r2
+
+ adcs r8, r8, r9
+
+ mul r17, r1, r4
+ umulh r18, r2, r3
+
+ adcs r10, r10, r11
+
+ mul r7, r1, r3
+ mul r9, r2, r3
+
+ adcs r12, r12, r13
+ cinc r14, r14, cs
+
+ mul r11, r2, r4
+ umulh r13, r2, r4
+
+ adds r8, r8, r15
+ adcs r10, r10, r16
+
+ mul r15, r3, r4
+ umulh r16, r3, r4
+
+ adcs r12, r12, r17
+ adcs r14, r14, r18
+ cinc r13, r13, cs
+ C 5, 6, 8, (10, 7), (12, 9), (14, 11), (13, 15), 16
+ C 17, 18
+
+ mul r17, r0, r0
+ umulh r0, r0, r0
+
+ adds r10, r10, r7
+ adcs r12, r12, r9
+
+ mul r18, r1, r1
+ umulh r1, r1, r1
+
+ adcs r14, r14, r11
+
+ mul r7, r2, r2
+ umulh r2, r2, r2
+
+ adcs r13, r13, r15
+ cinc r16, r16, cs
+ C 5, 6, 8, 10, 12, 14, 13, 16
+ C 17, 0, 18, 1, 7, 2
+ C Free: 9, 11, 15
+
+ adds r5, r5, r5
+
+ mul r9, r3, r3
+ umulh r3, r3, r3
+
+ adcs r6, r6, r6
+ adcs r8, r8, r8
+ adcs r10, r10, r10
+
+ mul r11, r4, r4
+ umulh r4, r4, r4
+
+ adcs r12, r12, r12
+ adcs r14, r14, r14
+ adcs r13, r13, r13
+ adcs r16, r16, r16
+ cset r15, cs
+ C 5, 6, 8, 10, 12, 14, 13, 16, 15
+ C 17, 0, 18, 1, 7, 2, 9, 3, 11, 4
+
+ adds r0, r0, r5
+ adcs r6, r6, r18
+ adcs r1, r1, r8
+ adcs r7, r7, r10
+ stp r17, r0, [rp]
+ adcs r2, r2, r12
+ adcs r9, r9, r14
+ adcs r3, r3, r13
+ stp r6, r1, [rp, #2*8]
+ adcs r11, r11, r16
+ adc r4, r4, r15
+ stp r7, r2, [rp, #4*8]
+ ldp r17, r18, [sp], #2*8
+ stp r9, r3, [rp, #6*8]
+ stp r11, r4, [rp, #8*8]
+
+ mov res, r4
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqr_6)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+
+ stp r17, r18, [sp, #-4*8]!
+ stp r19, r20, [sp, #2*8]
+
+ C mul rX, r0, r1 First computed when doubling
+ umulh r6, r0, r1
+ mul r7, r0, r2
+ umulh r8, r0, r2
+ mul r9, r0, r3
+ umulh r10, r0, r3
+ mul r11, r0, r4
+ umulh r12, r0, r4
+ mul r13, r0, r5
+
+ adds r6, r6, r7
+
+ umulh r14, r0, r5
+ C 6, (8 + cs, 9), (10, 11), (12, 13), 14
+
+ mul r15, r1, r2
+
+ adcs r8, r8, r9
+
+ mul r16, r1, r3
+ mul r17, r1, r4
+
+ adcs r10, r10, r11
+
+ mul r18, r1, r5
+ umulh r19, r1, r5
+
+ adcs r12, r12, r13
+
+ umulh r20, r1, r2
+ umulh r7, r1, r3
+
+ cinc r14, r14, cs
+
+ umulh r9, r1, r4
+ C 6, (8, 15), (10, 16, 20), (12, 17, 7), (14, 18, 9), 19
+
+ umulh r11, r2, r4
+
+ adds r8, r8, r15
+ adcs r10, r10, r16
+ adcs r12, r12, r17
+
+ umulh r13, r2, r5
+ mul r15, r2, r3
+
+ adcs r14, r14, r18
+ cinc r19, r19, cs
+
+ umulh r16, r2, r3
+ mul r17, r2, r5
+
+ adds r10, r10, r20
+ adcs r12, r12, r7
+ C 6, 8, 10, (12, 15), (14 + cs, 9, 16), (19, 11, 17), 13
+
+ umulh r18, r3, r4
+ umulh r20, r3, r5
+
+ adcs r14, r14, r9
+ adcs r19, r19, r11
+
+ mul r7, r2, r4
+ mul r9, r3, r4
+
+ cinc r13, r13, cs
+ adds r12, r12, r15
+
+ mul r11, r3, r5
+ mul r15, r4, r5
+
+ adcs r14, r14, r16
+ adcs r19, r19, r17
+
+ umulh r16, r4, r5
+ mul r17, r0, r1
+
+ adcs r13, r13, r18
+ cinc r20, r20, cs
+ C 17, 6, 8, 10, 12, (14, 7), (19, 9), (13, 11), (20, 15), 16
+ C Free: 18
+
+ mul r18, r0, r0
+ umulh r0, r0, r0
+
+ adds r14, r14, r7
+ adcs r19, r19, r9
+
+ mul r7, r1, r1
+ umulh r1, r1, r1
+
+ adcs r13, r13, r11
+ adcs r20, r20, r15
+
+ mul r9, r2, r2
+ umulh r2, r2, r2
+
+ cinc r16, r16, cs
+ C 17, 6, 8, 10, 12, 14, 19, 13, 20, 16
+ C 18, 0, 7, 1, 9, 2
+ C Free: 11, 15
+
+ adds r17, r17, r17
+ adcs r6, r6, r6
+
+ mul r11, r3, r3
+ umulh r3, r3, r3
+
+ adcs r8, r8, r8
+ adcs r10, r10, r10
+ adcs r12, r12, r12
+ adcs r14, r14, r14
+ lsr r15, r16, #63
+ adcs r19, r19, r19
+ adcs r13, r13, r13
+ adcs r20, r20, r20
+ adc r16, r16, r16
+ C 17, 6, 8, 10, 12, 14, 19, 13, 20, 16, 15
+ C 18, 0, 7, 1, 9, 2, 11, 3
+ C Free:
+
+ adds r0, r0, r17
+
+ mul r17, r4, r4
+ umulh r4, r4, r4
+
+ adcs r7, r7, r6
+ adcs r1, r1, r8
+ adcs r9, r9, r10
+
+ stp r18, r0, [rp]
+
+ mul r6, r5, r5
+ umulh r5, r5, r5
+
+ adcs r2, r2, r12
+ adcs r11, r11, r14
+ adcs r3, r3, r19
+
+ stp r7, r1, [rp, #2*8]
+
+ adcs r13, r13, r17
+ adcs r4, r4, r20
+
+ stp r9, r2, [rp, #4*8]
+
+ ldp r19, r20, [sp, #2*8]
+
+ stp r11, r3, [rp, #6*8]
+
+ adcs r6, r6, r16
+ adcs r5, r5, r15
+
+ stp r13, r4, [rp, #8*8]
+
+ ldp r17, r18, [sp], #4*8
+
+ stp r6, r5, [rp, #10*8]
+
+ mov res, r5
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqr_7)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+ stp r17, r18, [sp, #-6*8]!
+
+ ldr r6, [ap, #6*8]
+ stp r19, r20, [sp, #2*8]
+ stp r21, r22, [sp, #4*8]
+
+ C mul rX, r0, r1 First computed when doubling
+ umulh r7, r0, r1
+ mul r8, r0, r2
+ umulh r9, r0, r2
+ mul r10, r0, r3
+ umulh r11, r0, r3
+ mul r12, r0, r4
+ umulh r13, r0, r4
+ mul r14, r0, r5
+
+ adds r7, r7, r8
+
+ umulh r15, r0, r5
+ mul r16, r0, r6
+
+ adcs r9, r9, r10
+
+ umulh r17, r0, r6
+ C 7, 9, (11 + cs, 12), (13, 14), (15, 16), 17
+
+ mul r18, r1, r2
+
+ adcs r11, r11, r12
+
+ mul r19, r1, r3
+ mul r20, r1, r4
+
+ adcs r13, r13, r14
+
+ mul r21, r1, r5
+ mul r22, r1, r6
+
+ adcs r15, r15, r16
+
+ umulh r8, r1, r6
+ umulh r10, r1, r2
+
+ cinc r17, r17, cs
+
+ umulh r12, r1, r3
+ umulh r14, r1, r4
+
+ adds r9, r9, r18
+
+ umulh r16, r1, r5
+ mul r18, r2, r3
+ C 7, 9, (11 + cs, 19, 10), (13, 20, 12, 18), (15, 21, 14), (17, 22, 16), 8
+ C Free:
+
+ adcs r11, r11, r19
+ adcs r13, r13, r20
+
+ mul r19, r2, r4
+ mul r20, r2, r5
+
+ adcs r15, r15, r21
+ adcs r17, r17, r22
+
+ mul r21, r2, r6
+ umulh r22, r2, r6
+
+ cinc r8, r8, cs
+
+ adds r11, r11, r10
+ adcs r13, r13, r12
+
+ umulh r10, r2, r3
+ umulh r12, r2, r4
+
+ adcs r15, r15, r14
+ adcs r17, r17, r16
+
+ umulh r14, r2, r5
+ umulh r16, r3, r5
+
+ cinc r8, r8, cs
+
+ adds r13, r13, r18
+ adcs r15, r15, r19
+ C 7, 9, 11, 13, (15, 10), (17 + cs, 20, 12), (8, 21, 14), (22, 16)
+
+ umulh r18, r3, r6
+ mul r19, r3, r4
+
+ adcs r17, r17, r20
+ adcs r8, r8, r21
+
+ umulh r20, r3, r4
+ mul r21, r3, r6
+
+ cinc r22, r22, cs
+
+ adds r15, r15, r10
+ adcs r17, r17, r12
+
+ umulh r10, r4, r5
+ umulh r12, r4, r6
+
+ adcs r8, r8, r14
+ adcs r22, r22, r16
+
+ mul r14, r3, r5
+ mul r16, r4, r5
+
+ cinc r18, r18, cs
+
+ adds r17, r17, r19
+ adcs r8, r8, r20
+
+ mul r19, r4, r6
+ mul r20, r5, r6
+
+ adcs r22, r22, r21
+ adcs r18, r18, r10
+
+ umulh r21, r5, r6
+ mul r10, r0, r1
+
+ cinc r12, r12, cs
+ C 10, 7, 9, 11, 13, 15, 17, (8, 14), (22, 16), (18, 19), (12, 20), 21
+
+ adds r8, r8, r14
+ adcs r22, r22, r16
+
+ mul r14, r0, r0
+ umulh r0, r0, r0
+
+ adcs r18, r18, r19
+ adcs r12, r12, r20
+ cinc r21, r21, cs
+
+ mul r16, r1, r1
+ umulh r1, r1, r1
+ C 10, 7, 9, 11, 13, 15, 17, 8, 22, 18, 12, 21
+ C 14, 0, 16, 1
+ C Free: 19, 20
+
+ adds r10, r10, r10
+ adcs r7, r7, r7
+ adcs r9, r9, r9
+ adcs r11, r11, r11
+ adcs r13, r13, r13
+
+ mul r20, r2, r2
+ umulh r2, r2, r2
+
+ adcs r15, r15, r15
+ adcs r17, r17, r17
+ adcs r8, r8, r8
+ lsr r19, r21, #63
+ adcs r22, r22, r22
+ adcs r18, r18, r18
+ adcs r12, r12, r12
+ adc r21, r21, r21
+ C 10, 7, 9, 11, 13, 15, 17, 8, 22, 18, 12, 21, 19
+ C 14, 0, 16, 1, 20, 2
+ C Free:
+
+ adds r0, r0, r10
+
+ mul r10, r3, r3
+ umulh r3, r3, r3
+
+ adcs r16, r16, r7
+ adcs r1, r1, r9
+
+ stp r14, r0, [rp]
+
+ mul r7, r4, r4
+ umulh r4, r4, r4
+
+ adcs r20, r20, r11
+ adcs r2, r2, r13
+
+ stp r16, r1, [rp, #2*8]
+
+ mul r9, r5, r5
+ umulh r5, r5, r5
+
+ adcs r10, r10, r15
+ adcs r3, r3, r17
+
+ stp r20, r2, [rp, #4*8]
+
+ mul r14, r6, r6
+ umulh r6, r6, r6
+
+ adcs r7, r7, r8
+ adcs r4, r4, r22
+
+ stp r10, r3, [rp, #6*8]
+
+ adcs r9, r9, r18
+ adcs r5, r5, r12
+ adcs r14, r14, r21
+ adcs r6, r6, r19
+
+ stp r7, r4, [rp, #8*8]
+ ldp r19, r20, [sp, #2*8]
+ stp r9, r5, [rp, #10*8]
+ ldp r21, r22, [sp, #4*8]
+ stp r14, r6, [rp, #12*8]
+ ldp r17, r18, [sp], #6*8
+
+ mov res, r6
+
+ ret
+EPILOGUE()
+
+dnl From a{n - 4} forward, use the following sequence:
+dnl
+dnl umulh rX, r2, r4
+dnl umulh rX, r2, r5
+dnl mul rX, r2, r3
+dnl umulh rX, r2, r3
+dnl mul rX, r2, r5
+dnl umulh rX, r3, r4
+dnl umulh rX, r3, r5
+dnl mul rX, r2, r4
+dnl mul rX, r3, r4
+dnl mul rX, r3, r5
+dnl mul rX, r4, r5
+dnl umulh rX, r4, r5
+dnl mul rX, r0, r1
+
+PROLOGUE(flint_mpn_sqr_8)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+ stp r17, r18, [sp, #-10*8]!
+ stp r19, r20, [sp, #2*8]
+ ldp r6, r7, [ap, #6*8]
+ stp r21, r22, [sp, #4*8]
+ stp r23, r24, [sp, #6*8]
+ stp r25, r26, [sp, #8*8]
+
+ C mul rX, r0, r1 Calculate this one later
+ umulh r8, r0, r1
+ mul r9, r0, r2
+ umulh r10, r0, r2
+ mul r11, r0, r3
+ umulh r12, r0, r3
+ mul r13, r0, r4
+ umulh r14, r0, r4
+ mul r15, r0, r5
+
+ adds r8, r8, r9
+
+ umulh r16, r0, r5
+ mul r17, r0, r6
+
+ adcs r10, r10, r11
+
+ umulh r18, r0, r6
+ mul r19, r0, r7
+
+ adcs r12, r12, r13
+
+ umulh r20, r0, r7
+ mul r21, r1, r2
+
+ adcs r14, r14, r15
+
+ mul r22, r1, r3
+ mul r23, r1, r4
+
+ adcs r16, r16, r17
+
+ mul r24, r1, r5
+ mul r25, r1, r6
+
+ adcs r18, r18, r19
+
+ mul r26, r1, r7
+ umulh r9, r1, r7
+
+ cinc r20, r20, cs
+ C 8, (10, 21), (12, 22), (14, 23), (16, 24), (18, 25), (20, 26), 9
+ C Free: 11, 13, 15, 17, 19
+
+ umulh r11, r1, r2
+ umulh r13, r1, r3
+
+ adds r10, r10, r21
+ adcs r12, r12, r22
+
+ umulh r15, r1, r4
+ umulh r17, r1, r5
+
+ adcs r14, r14, r23
+ adcs r16, r16, r24
+
+ umulh r19, r1, r6
+ mul r21, r2, r3
+
+ adcs r18, r18, r25
+ adcs r20, r20, r26
+
+ mul r22, r2, r4
+ mul r23, r2, r5
+
+ cinc r9, r9, cs
+ C 8, 10, (12, 11), (14, 13, 21), (16, 15, 22), (18, 17, 23), (20, 19), 9
+ C Free: 24, 25, 26
+
+ adds r12, r12, r11
+
+ mul r24, r2, r6
+ mul r25, r2, r7
+
+ adcs r14, r14, r13
+ adcs r16, r16, r15
+
+ umulh r26, r2, r7
+ umulh r11, r2, r3
+
+ adcs r18, r18, r17
+ adcs r20, r20, r19
+
+ umulh r13, r2, r4
+ umulh r15, r2, r5
+
+ cinc r9, r9, cs
+ C 8, 10, 12, (14, 21), (16, 22, 11), (18, 23, 13), (20, 24, 15), (9, 25), 26
+ C Free: 17, 19
+
+ adds r14, r14, r21
+
+ umulh r17, r2, r6
+ mul r19, r3, r4
+
+ adcs r16, r16, r22
+ adcs r18, r18, r23
+
+ mul r21, r3, r5
+ mul r22, r3, r6
+
+ adcs r20, r20, r24
+ adcs r9, r9, r25
+
+ mul r23, r3, r7
+ umulh r24, r3, r7
+
+ cinc r26, r26, cs
+ C 8, 10, 12, 14, (16, 11), (18, 13, 19), (20, 15, 21), (9, 17, 22), (26, 23), 24
+ C Free: 25
+
+ adds r16, r16, r11
+
+ umulh r25, r3, r4
+ umulh r11, r3, r5
+
+ adcs r18, r18, r13
+ adcs r20, r20, r15
+
+ umulh r13, r3, r6
+ umulh r15, r4, r6
+
+ adcs r9, r9, r17
+ cinc r26, r26, cs
+ C 8, 10, 12, 14, 16, (18, 19), (20, 21, 25), (9, 22, 11), (26, 23, 13), (24, 15)
+ C Free: 17
+
+ adds r18, r18, r19
+
+ umulh r17, r4, r7
+ mul r19, r4, r5
+
+ adcs r20, r20, r21
+ adcs r9, r9, r22
+
+ umulh r21, r4, r5
+ mul r22, r4, r7
+
+ adcs r26, r26, r23
+ cinc r24, r24, cs
+ C 8, 10, 12, 14, 16, 18, (20, 25), (9, 11, 19), (26, 13, 21), (24, 15, 22), 17
+ C Free: 23
+
+ adds r20, r20, r25
+
+ umulh r23, r5, r6
+ umulh r25, r5, r7
+
+ adcs r9, r9, r11
+ adcs r26, r26, r13
+
+ mul r11, r4, r6
+ mul r13, r5, r6
+
+ adcs r24, r24, r15
+ cinc r17, r17, cs
+ C 8, 10, 12, 14, 16, 18, 20, (9, 19), (26, 21, 11), (24, 22, 13), (17, 23), 25
+ C Free: 15
+
+ adds r9, r9, r19
+
+ mul r15, r5, r7
+ mul r19, r6, r7
+
+ adcs r26, r26, r21
+ adcs r24, r24, r22
+
+ umulh r21, r6, r7
+ mul r22, r0, r1
+
+ adcs r17, r17, r23
+ cinc r25, r25, cs
+
+ mul r23, r0, r0
+ umulh r0, r0, r0
+
+ adds r26, r26, r11
+ adcs r24, r24, r13
+
+ mul r11, r1, r1
+ umulh r1, r1, r1
+
+ adcs r17, r17, r15
+ adcs r25, r25, r19
+
+ mul r13, r2, r2
+ umulh r2, r2, r2
+
+ cinc r21, r21, cs
+ C 22, 8, 10, 12, 14, 16, 18, 20, 9, 26, 24, 17, 25, 21
+ C 23, 0, 11, 1, 13, 2
+ C Free: 15, 19
+
+ adds r22, r22, r22
+ adcs r8, r8, r8
+ adcs r10, r10, r10
+ adcs r12, r12, r12
+ adcs r14, r14, r14
+ adcs r16, r16, r16
+
+ mul r19, r3, r3
+ umulh r3, r3, r3
+
+ adcs r18, r18, r18
+ adcs r20, r20, r20
+ adcs r9, r9, r9
+ lsr r15, r21, #63
+ adcs r26, r26, r26
+ adcs r24, r24, r24
+ adcs r17, r17, r17
+ adcs r25, r25, r25
+ adc r21, r21, r21
+ C 22, 8, 10, 12, 14, 16, 18, 20, 9, 26, 24, 17, 25, 21, 15
+ C 23, 0, 11, 1, 13, 2, 19, 3
+ C Free:
+
+ adds r0, r0, r22
+
+ mul r22, r4, r4
+ umulh r4, r4, r4
+
+ adcs r11, r11, r8
+ adcs r1, r1, r10
+
+ stp r23, r0, [rp]
+
+ mul r8, r5, r5
+ umulh r5, r5, r5
+
+ adcs r13, r13, r12
+ adcs r2, r2, r14
+
+ stp r11, r1, [rp, #2*8]
+
+ mul r10, r6, r6
+ umulh r6, r6, r6
+
+ adcs r16, r16, r19
+ adcs r3, r3, r18
+
+ stp r13, r2, [rp, #4*8]
+
+ mul r12, r7, r7
+ umulh r7, r7, r7
+
+ adcs r22, r22, r20
+ adcs r4, r4, r9
+
+ stp r16, r3, [rp, #6*8]
+ ldp r19, r20, [sp, #2*8]
+
+ adcs r8, r8, r26
+ adcs r5, r5, r24
+
+ stp r22, r4, [rp, #8*8]
+ ldp r23, r24, [sp, #6*8]
+
+ adcs r10, r10, r17
+ adcs r6, r6, r25
+
+ stp r8, r5, [rp, #10*8]
+ ldp r25, r26, [sp, #8*8]
+
+ adcs r12, r12, r21
+ adc r7, r7, r15
+
+ stp r10, r6, [rp, #12*8]
+ ldp r21, r22, [sp, #4*8]
+ stp r12, r7, [rp, #14*8]
+ ldp r17, r18, [sp], #10*8
+
+ mov res, r7
+
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqr_9)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+ stp r17, r18, [sp, #-12*8]!
+ stp r19, r20, [sp, #2*8]
+ ldp r6, r7, [ap, #6*8]
+ ldr r8, [ap, #8*8]
+ stp r21, r22, [sp, #4*8]
+ stp r23, r24, [sp, #6*8]
+ stp r25, r26, [sp, #8*8]
+ str r27, [sp, #10*8]
+
+ C mul rX, r0, r1 Calculated later
+ umulh r9, r0, r1
+ mul r10, r0, r2
+ umulh r11, r0, r2
+ mul r12, r0, r3
+ umulh r13, r0, r3
+ mul r14, r0, r4
+ umulh r15, r0, r4
+ mul r16, r0, r5
+
+ adds r9, r9, r10
+
+ umulh r17, r0, r5
+ mul r18, r0, r6
+
+ adcs r11, r11, r12
+
+ umulh r19, r0, r6
+ mul r20, r0, r7
+
+ adcs r13, r13, r14
+
+ umulh r21, r0, r7
+ mul r22, r0, r8
+
+ adcs r15, r15, r16
+
+ umulh r23, r0, r8
+ mul r24, r1, r2
+
+ adcs r17, r17, r18
+
+ mul r25, r1, r3
+ mul r26, r1, r4
+
+ adcs r19, r19, r20
+
+ mul r27, r1, r5
+ mul r10, r1, r6
+
+ adcs r21, r21, r22
+
+ mul r12, r1, r7
+ mul r14, r1, r8
+
+ cinc r23, r23, cs
+ C 9, (11, 24), (13, 25), (15, 26), (17, 27), (19, 10), (21, 12), (23, 14)
+ C Free: 16, 18, 20, 22
+
+ umulh r16, r1, r8
+ umulh r18, r1, r2
+
+ adds r11, r11, r24
+ adcs r13, r13, r25
+
+ umulh r20, r1, r3
+ umulh r22, r1, r4
+
+ adcs r15, r15, r26
+ adcs r17, r17, r27
+
+ umulh r24, r1, r5
+ umulh r25, r1, r6
+
+ adcs r19, r19, r10
+ adcs r21, r21, r12
+
+ umulh r26, r1, r7
+ mul r27, r2, r3
+
+ adcs r23, r23, r14
+ cinc r16, r16, cs
+ C 9, 11, (13, 18), (15, 20, 27), (17, 22), (19, 24), (21, 25), (23, 26), 16
+ C Free: 10, 12, 14
+
+ mul r10, r2, r4
+ mul r12, r2, r5
+
+ adds r13, r13, r18
+ adcs r15, r15, r20
+
+ mul r14, r2, r6
+ mul r18, r2, r7
+
+ adcs r17, r17, r22
+ adcs r19, r19, r24
+
+ mul r20, r2, r8
+ umulh r22, r2, r8
+
+ adcs r21, r21, r25
+ adcs r23, r23, r26
+
+ umulh r24, r2, r3
+ umulh r25, r2, r4
+
+ cinc r16, r16, cs
+ C 9, 11, 13, (15, 27), (17, 10, 24), (19, 12, 25), (21, 14), (23, 18), (16, 20), 22
+ C Free: 26
+
+ adds r15, r15, r27
+
+ umulh r26, r2, r5
+ umulh r27, r2, r6
+
+ adcs r17, r17, r10
+ adcs r19, r19, r12
+
+ umulh r10, r2, r7
+ mul r12, r3, r4
+
+ adcs r21, r21, r14
+ adcs r23, r23, r18
+
+ mul r14, r3, r5
+ mul r18, r3, r6
+
+ adcs r16, r16, r20
+ cinc r22, r22, cs
+ C 9, 11, 13, 15, (17, 24), (19, 25, 12), (21, 26, 14), (23, 27, 18), (16, 10), 22
+ C Free: 20
+
+ adds r17, r17, r24
+
+ mul r20, r3, r7
+ mul r24, r3, r8
+
+ adcs r19, r19, r25
+ adcs r21, r21, r26
+
+ umulh r25, r3, r8
+ umulh r26, r3, r4
+
+ adcs r23, r23, r27
+ adcs r16, r16, r10
+
+ umulh r27, r3, r5
+ umulh r10, r3, r6
+
+ cinc r22, r22, cs
+ C 9, 11, 13, 15, 17, (19, 12), (21, 14, 26), (23, 18, 27), (16, 20, 10), (22, 24), 25
+
+ adds r19, r19, r12
+ adcs r21, r21, r14
+
+ umulh r12, r3, r7
+ mul r14, r4, r5
+
+ adcs r23, r23, r18
+ adcs r16, r16, r20
+
+ mul r18, r4, r6
+ mul r20, r4, r7
+
+ adcs r22, r22, r24
+ cinc r25, r25, cs
+ C 9, 11, 13, 15, 17, 19, (21, 26), (23, 27, 14), (16, 10, 18), (22, 12, 20), 25
+
+ adds r21, r21, r26
+
+ mul r24, r4, r8
+ umulh r26, r4, r8
+
+ adcs r23, r23, r27
+ adcs r16, r16, r10
+
+ umulh r27, r4, r5
+ umulh r10, r4, r6
+
+ adcs r22, r22, r12
+ cinc r25, r25, cs
+ C 9, 11, 13, 15, 17, 19, 21, (23, 14), (16, 18, 27), (22, 20, 10), (25, 24), 26
+
+ adds r23, r23, r14
+
+ umulh r12, r4, r7
+ umulh r14, r5, r7
+
+ adcs r16, r16, r18
+ adcs r22, r22, r20
+
+ umulh r18, r5, r8
+ mul r20, r5, r6
+
+ adcs r25, r25, r24
+ cinc r26, r26, cs
+ C 9, 11, 13, 15, 17, 19, 21, 23, (16, 27), (22, 10, 20), (25, 12), (26, 14), 18
+
+ adds r16, r16, r27
+
+ umulh r24, r5, r6
+ mul r27, r5, r8
+
+ adcs r22, r22, r10
+ adcs r25, r25, r12
+
+ umulh r10, r6, r7
+ umulh r12, r6, r8
+
+ adcs r26, r26, r14
+ cinc r18, r18, cs
+ C 9, 11, 13, 15, 17, 19, 21, 23, 16, (22, 20), (25, 24), (26, 27), (18, 10), 12
+
+ adds r22, r22, r20
+
+ mul r14, r5, r7
+ mul r20, r6, r7
+
+ adcs r25, r25, r24
+ adcs r26, r26, r27
+
+ mul r24, r6, r8
+ mul r27, r7, r8
+
+ adcs r18, r18, r10
+ cinc r12, r12, cs
+
+ umulh r10, r7, r8
+
+ adds r25, r25, r14
+
+ mul r14, r0, r1
+
+ adcs r26, r26, r20
+
+ mul r20, r0, r0
+ umulh r0, r0, r0
+
+ adcs r18, r18, r24
+ adcs r12, r12, r27
+
+ mul r24, r1, r1
+ umulh r1, r1, r1
+
+ cinc r10, r10, cs
+ C 14, 9, 11, 13, 15, 17, 19, 21, 23, 16, 22, 25, 26, 18, 12, 10
+ C 20, 0, 24, 1
+ C Free: 27
+
+ adds r14, r14, r14
+ adcs r9, r9, r9
+ adcs r11, r11, r11
+ adcs r13, r13, r13
+ adcs r15, r15, r15
+ adcs r17, r17, r17
+ adcs r19, r19, r19
+ adcs r21, r21, r21
+ lsr r27, r10, #63
+ adcs r23, r23, r23
+ adcs r16, r16, r16
+ adcs r22, r22, r22
+ adcs r25, r25, r25
+ adcs r26, r26, r26
+ adcs r18, r18, r18
+ adcs r12, r12, r12
+ adc r10, r10, r10
+ C 14, 9, 11, 13, 15, 17, 19, 21, 23, 16, 22, 25, 26, 18, 12, 10, 27
+ C 20, 0, 24, 1
+ C Free:
+
+ adds r0, r0, r14
+
+ mul r14, r2, r2
+ umulh r2, r2, r2
+
+ adcs r24, r24, r9
+ adcs r1, r1, r11
+
+ mul r9, r3, r3
+ umulh r3, r3, r3
+
+ stp r20, r0, [rp]
+
+ mul r11, r4, r4
+ umulh r4, r4, r4
+
+ stp r24, r1, [rp, #2*8]
+
+ adcs r14, r14, r13
+ adcs r2, r2, r15
+
+ mul r0, r5, r5
+ umulh r5, r5, r5
+
+ mul r1, r6, r6
+ umulh r6, r6, r6
+
+ adcs r9, r9, r17
+ adcs r3, r3, r19
+
+ stp r14, r2, [rp, #4*8]
+
+ mul r13, r7, r7
+ umulh r7, r7, r7
+
+ adcs r11, r11, r21
+ adcs r4, r4, r23
+
+ ldp r19, r20, [sp, #2*8]
+ stp r9, r3, [rp, #6*8]
+
+ mul r15, r8, r8
+ umulh r8, r8, r8
+
+ adcs r0, r0, r16
+ adcs r5, r5, r22
+
+ ldp r21, r22, [sp, #4*8]
+
+ adcs r1, r1, r25
+ adcs r6, r6, r26
+
+ stp r11, r4, [rp, #8*8]
+ ldp r23, r24, [sp, #6*8]
+
+ adcs r13, r13, r18
+ adcs r7, r7, r12
+
+ stp r0, r5, [rp, #10*8]
+ ldp r25, r26, [sp, #8*8]
+
+ adcs r15, r15, r10
+ adc r8, r8, r27
+
+ stp r1, r6, [rp, #12*8]
+ ldr r27, [sp, #10*8]
+ stp r13, r7, [rp, #14*8]
+ ldp r17, r18, [sp], #12*8
+ stp r15, r8, [rp, #16*8]
+
+ mov res, r8
+
+ ret
+EPILOGUE()
diff --git a/src/mpn_extras/arm64/sqrhigh_hard.asm b/src/mpn_extras/arm64/sqrhigh_hard.asm
new file mode 100644
index 0000000000..8e5275b286
--- /dev/null
+++ b/src/mpn_extras/arm64/sqrhigh_hard.asm
@@ -0,0 +1,528 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl (at your option) any later version. See .
+dnl
+
+include(`config.m4')
+
+define(`rp', `x0')
+define(`ap', `x1')
+
+define(`r0', `x2')
+define(`r1', `x3')
+define(`r2', `x4')
+define(`r3', `x5')
+define(`r4', `x6')
+define(`r5', `x7')
+define(`r6', `x8')
+define(`r7', `x9')
+define(`r8', `x10')
+define(`r9', `x11')
+define(`r10', `x12')
+define(`r11', `x13')
+define(`r12', `x14')
+define(`r13', `x15')
+define(`r14', `x16')
+define(`r15', `x17')
+
+define(`r16', `ap') dnl Beware!
+
+dnl NOTE: The following has to be pushed and popped
+dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times!
+define(`r17', `x19')
+define(`r18', `x20')
+define(`r19', `x21')
+define(`r20', `x22')
+define(`r21', `x23')
+define(`r22', `x24')
+define(`r23', `x25')
+define(`r24', `x26')
+define(`r25', `x27')
+define(`r26', `x28')
+define(`r27', `x30')
+
+define(`res', `rp') dnl NOTE: Synonymous with rp!
+
+PROLOGUE(flint_mpn_sqrhigh_1)
+ ldr r0, [ap]
+ umulh r1, r0, r0
+ mul r0, r0, r0
+ str r1, [rp]
+ mov res, r0
+ ret
+EPILOGUE()
+
+PROLOGUE(flint_mpn_sqrhigh_2)
+ ldp r0, r1, [ap]
+ umulh r2, r0, r0
+ mul r3, r0, r1
+ umulh r4, r0, r1
+ mul r5, r1, r1
+ umulh r6, r1, r1
+ adds r2, r2, r3
+ adcs r5, r5, r4
+ cinc r6, r6, cs
+ adds r2, r2, r3
+ adcs r5, r5, r4
+ cinc r6, r6, cs
+ stp r5, r6, [rp]
+ mov res, r2
+ ret
+EPILOGUE()
+
+C 0 1 2
+C 1 h x
+C 2 d x
+C 3 d
+PROLOGUE(flint_mpn_sqrhigh_3)
+ ldp r0, r1, [ap]
+ ldr r2, [ap, #2*8]
+ umulh r3, r0, r1
+ mul r4, r0, r2
+ umulh r0, r0, r2
+ mul r5, r1, r2
+ umulh r6, r1, r2
+ mul r7, r1, r1
+ adds r3, r3, r4
+ umulh r1, r1, r1
+ mul r8, r2, r2
+ adcs r0, r0, r5
+ umulh r2, r2, r2
+ cinc r6, r6, cs
+ adds r3, r3, r3
+ adcs r0, r0, r0
+ adcs r6, r6, r6
+ cset r4, cs
+ adds r7, r7, r3
+ adcs r1, r1, r0
+ adcs r8, r8, r6
+ adc r2, r2, r4
+ stp r1, r8, [rp]
+ str r2, [rp, #2*8]
+ mov res, r7
+ ret
+EPILOGUE()
+
+C 0 1 2 3
+C 0 h x
+C 1 e x x
+C 2 d x
+C 3 d
+PROLOGUE(flint_mpn_sqrhigh_4)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ umulh r4, r0, r2
+ mul r5, r0, r3
+ umulh r6, r0, r3
+ umulh r7, r1, r2
+ umulh r8, r1, r3
+ mul r9, r1, r2
+ adds r4, r4, r5
+ mul r10, r1, r3
+ mul r11, r2, r3
+ adcs r6, r6, r7
+ umulh r12, r2, r3
+ umulh r1, r1, r1
+ cinc r8, r8, cs
+ mul r13, r2, r2
+ umulh r2, r2, r2
+ adds r4, r4, r9
+ mul r14, r3, r3
+ umulh r3, r3, r3
+ adcs r6, r6, r10
+ adcs r8, r8, r11
+ cinc r12, r12, cs
+ adds r4, r4, r4
+ adcs r6, r6, r6
+ adcs r8, r8, r8
+ adcs r12, r12, r12
+ cset r0, cs
+ adds r4, r4, r1
+ adcs r6, r6, r13
+ adcs r8, r8, r2
+ adcs r12, r12, r14
+ adc r0, r0, r3
+ stp r6, r8, [rp]
+ stp r12, r0, [rp, #2*8]
+ mov res, r4
+
+ ret
+EPILOGUE()
+
+C 0 1 2 3 4
+C 0 h x
+C 1 h x x
+C 2 d x x
+C 3 d x
+C 4 d
+PROLOGUE(flint_mpn_sqrhigh_5)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldr r4, [ap, #4*8]
+
+ umulh r5, r0, r3
+ mul r6, r0, r4
+ umulh r7, r0, r4
+ umulh r8, r1, r3
+ umulh r9, r1, r4
+ umulh r10, r1, r2
+ adds r5, r5, r6
+ mul r11, r1, r4
+ umulh r12, r2, r3
+ adcs r7, r7, r8
+ umulh r13, r2, r4
+ mul r14, r1, r3
+ cinc r9, r9, cs
+ mul r15, r2, r3
+ mul r16, r2, r4
+ adds r5, r5, r10
+ adcs r7, r7, r11
+ mul r6, r3, r4
+ umulh r8, r3, r4
+ adcs r9, r9, r12
+ cinc r13, r13, cs
+ mul r0, r2, r2
+ umulh r2, r2, r2
+ adds r5, r5, r14
+ adcs r7, r7, r15
+ mul r1, r3, r3
+ umulh r3, r3, r3
+ adcs r9, r9, r16
+ adcs r13, r13, r6
+ mul r10, r4, r4
+ umulh r4, r4, r4
+ cinc r8, r8, cs
+ adds r5, r5, r5
+ adcs r7, r7, r7
+ adcs r9, r9, r9
+ adcs r13, r13, r13
+ adcs r8, r8, r8
+ cset r11, cs
+ adds r5, r5, r0
+ adcs r7, r7, r2
+ adcs r9, r9, r1
+ adcs r13, r13, r3
+ stp r7, r9, [rp]
+ adcs r8, r8, r10
+ stp r13, r8, [rp, #2*8]
+ adc r11, r11, r4
+ str r11, [rp, #4*8]
+ mov res, r5
+
+ ret
+EPILOGUE()
+
+C 0 1 2 3 4 5
+C 0 h x
+C 1 h x x
+C 2 e x x x
+C 3 d x x
+C 4 d x
+C 5 d
+PROLOGUE(flint_mpn_sqrhigh_6)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+ umulh r6, r1, r3
+ mul r7, r2, r3
+ umulh r8, r2, r3
+ umulh r9, r1, r4
+ umulh r10, r2, r4
+ umulh r11, r0, r4
+ adds r6, r6, r7
+ mul r12, r2, r4
+ mul r13, r3, r4
+ adcs r8, r8, r9
+ umulh r14, r3, r4
+ mul r15, r1, r4
+ cinc r10, r10, cs
+ umulh r16, r0, r5
+ adds r6, r6, r11
+ umulh r7, r1, r5
+ adcs r8, r8, r12
+ umulh r9, r2, r5
+ adcs r10, r10, r13
+ umulh r11, r3, r5
+ cinc r14, r14, cs
+ mul r0, r0, r5
+ adds r6, r6, r15
+ mul r1, r1, r5
+ adcs r8, r8, r16
+ mul r12, r2, r5
+ adcs r10, r10, r7
+ mul r13, r3, r5
+ adcs r14, r14, r9
+ mul r15, r4, r5
+ cinc r11, r11, cs
+ adds r6, r6, r0
+ umulh r16, r4, r5
+ adcs r8, r8, r1
+ umulh r2, r2, r2
+ adcs r10, r10, r12
+ mul r7, r3, r3
+ adcs r14, r14, r13
+ umulh r3, r3, r3
+ adcs r11, r11, r15
+ mul r9, r4, r4
+ cinc r16, r16, cs
+ umulh r4, r4, r4
+ adds r6, r6, r6
+ adcs r8, r8, r8
+ mul r0, r5, r5
+ adcs r10, r10, r10
+ adcs r14, r14, r14
+ umulh r5, r5, r5
+ adcs r11, r11, r11
+ adcs r16, r16, r16
+ cset r1, cs
+ adds r2, r2, r6
+ adcs r7, r7, r8
+ adcs r3, r3, r10
+ stp r7, r3, [rp]
+ adcs r9, r9, r14
+ adcs r4, r4, r11
+ stp r9, r4, [rp, #2*8]
+ adcs r0, r0, r16
+ adc r5, r5, r1
+ stp r0, r5, [rp, #4*8]
+ mov res, r2
+
+ ret
+EPILOGUE()
+
+C 0 1 2 3 4 5 6
+C 0 h x
+C 1 h x x
+C 2 h x x x
+C 3 d x x x
+C 4 d x x
+C 5 d x
+C 6 d
+PROLOGUE(flint_mpn_sqrhigh_7)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+ ldr r6, [ap, #6*8]
+
+ umulh r7, r2, r3
+
+ umulh r8, r1, r4
+ umulh r9, r2, r4
+ mul r10, r3, r4
+ umulh r11, r3, r4
+ mul r12, r2, r4
+
+ adds r7, r7, r8
+ umulh r13, r1, r5
+ umulh r14, r2, r5
+ adcs r9, r9, r10
+ umulh r15, r3, r5
+ cinc r11, r11, cs
+ umulh r16, r0, r5
+ adds r7, r7, r12
+ mul r8, r2, r5
+ adcs r9, r9, r13
+ mul r10, r3, r5
+ adcs r11, r11, r14
+ mul r12, r4, r5
+ cinc r15, r15, cs
+ umulh r13, r4, r5
+
+ adds r7, r7, r16
+ mul r14, r1, r5
+
+ adcs r9, r9, r8
+ umulh r16, r0, r6
+ adcs r11, r11, r10
+ umulh r8, r1, r6
+ adcs r15, r15, r12
+ umulh r10, r2, r6
+ cinc r13, r13, cs
+ umulh r12, r3, r6
+ adds r7, r7, r14
+ umulh r14, r4, r6
+
+ adcs r9, r9, r16
+ mul r0, r0, r6
+ adcs r11, r11, r8
+ mul r1, r1, r6
+ adcs r15, r15, r10
+ mul r2, r2, r6
+ adcs r13, r13, r12
+ mul r16, r3, r6
+ cinc r14, r14, cs
+ mul r8, r4, r6
+ adds r7, r7, r0
+ mul r10, r5, r6
+ adcs r9, r9, r1
+ umulh r12, r5, r6
+
+ mul r0, r3, r3
+ adcs r11, r11, r2
+ umulh r3, r3, r3
+ adcs r15, r15, r16
+ mul r1, r4, r4
+ adcs r13, r13, r8
+ umulh r4, r4, r4
+ adcs r14, r14, r10
+ mul r2, r5, r5
+ cinc r12, r12, cs
+ umulh r5, r5, r5
+ adds r7, r7, r7
+ adcs r9, r9, r9
+ mul r16, r6, r6
+ adcs r11, r11, r11
+ adcs r15, r15, r15
+ umulh r6, r6, r6
+ adcs r13, r13, r13
+ adcs r14, r14, r14
+ adcs r12, r12, r12
+ cset r8, cs
+
+ adds r0, r0, r7
+ adcs r3, r3, r9
+ adcs r1, r1, r11
+ stp r3, r1, [rp]
+ adcs r4, r4, r15
+ adcs r2, r2, r13
+ stp r4, r2, [rp, #2*8]
+ adcs r5, r5, r14
+ adcs r16, r16, r12
+ stp r5, r16, [rp, #4*8]
+ adc r6, r6, r8
+ str r6, [rp, #6*8]
+ mov res, r0
+
+ ret
+EPILOGUE()
+
+C 0 1 2 3 4 5 6 7
+C 0 h x
+C 1 h x x
+C 2 h x x x
+C 3 e x x x x
+C 4 d x x x
+C 5 d x x
+C 6 d x
+C 7 d
+PROLOGUE(flint_mpn_sqrhigh_8)
+ ldp r0, r1, [ap]
+ ldp r2, r3, [ap, #2*8]
+ ldp r4, r5, [ap, #4*8]
+ ldp r6, r7, [ap, #6*8]
+
+ umulh r8, r2, r4
+ mul r9, r3, r4
+ umulh r10, r3, r4
+
+ umulh r11, r2, r5
+ umulh r12, r3, r5
+ umulh r13, r1, r5
+ adds r8, r8, r9
+ mul r14, r3, r5
+ mul r15, r4, r5
+ adcs r10, r10, r11
+ umulh r16, r4, r5
+ cinc r12, r12, cs
+ mul r9, r2, r5
+
+ adds r8, r8, r13
+ umulh r11, r1, r6
+ adcs r10, r10, r14
+ umulh r13, r2, r6
+ adcs r12, r12, r15
+ umulh r14, r3, r6
+ cinc r16, r16, cs
+ umulh r15, r4, r6
+ adds r8, r8, r9
+ mul r9, r1, r6
+ adcs r10, r10, r11
+ mul r11, r2, r6
+ adcs r12, r12, r13
+ mul r13, r3, r6
+ adcs r16, r16, r14
+ mul r14, r4, r6
+ cinc r15, r15, cs
+ adds r8, r8, r9
+ mul r9, r5, r6
+ adcs r10, r10, r11
+ umulh r11, r5, r6
+ adcs r12, r12, r13
+ umulh r13, r0, r6
+
+ adcs r16, r16, r14
+ umulh r14, r0, r7
+ adcs r15, r15, r9
+ umulh r9, r1, r7
+ cinc r11, r11, cs
+ mul r0, r0, r7 C Reduce latency
+ adds r8, r8, r13
+ umulh r13, r2, r7
+ adcs r10, r10, r14
+ umulh r14, r3, r7
+ adcs r12, r12, r9
+ umulh r9, r4, r7
+ adcs r16, r16, r13
+ umulh r13, r5, r7
+ mul r1, r1, r7
+ adcs r15, r15, r14
+ mul r2, r2, r7
+ adcs r11, r11, r9
+ mul r14, r3, r7
+ cinc r13, r13, cs
+ mul r9, r4, r7
+ adds r8, r8, r0
+ mul r0, r5, r7
+ adcs r10, r10, r1
+ mul r1, r6, r7
+ adcs r12, r12, r2
+ umulh r2, r6, r7
+
+ adcs r16, r16, r14
+ umulh r3, r3, r3
+ adcs r15, r15, r9
+ mul r14, r4, r4
+ adcs r11, r11, r0
+ umulh r4, r4, r4
+ adcs r13, r13, r1
+ mul r9, r5, r5
+ cinc r2, r2, cs
+ umulh r5, r5, r5
+
+ adds r8, r8, r8
+ mul r0, r6, r6
+ adcs r10, r10, r10
+ umulh r6, r6, r6
+ adcs r12, r12, r12
+ adcs r16, r16, r16
+ adcs r15, r15, r15
+ adcs r11, r11, r11
+ adcs r13, r13, r13
+ adcs r2, r2, r2
+ cset r1, cs
+
+ adds r3, r3, r8
+ mul r8, r7, r7
+ adcs r14, r14, r10
+ umulh r7, r7, r7
+ adcs r4, r4, r12
+ stp r14, r4, [rp]
+ adcs r9, r9, r16
+ adcs r5, r5, r15
+ stp r9, r5, [rp, #2*8]
+ adcs r0, r0, r11
+ adcs r6, r6, r13
+ stp r0, r6, [rp, #4*8]
+ adcs r8, r8, r2
+ adc r7, r7, r1
+ stp r8, r7, [rp, #6*8]
+ mov res, r3
+
+ ret
+EPILOGUE()
diff --git a/src/mpn_extras/mulhigh_basecase.c b/src/mpn_extras/mulhigh_basecase.c
index f38249e1a9..0f4adc0c3c 100644
--- a/src/mpn_extras/mulhigh_basecase.c
+++ b/src/mpn_extras/mulhigh_basecase.c
@@ -82,7 +82,55 @@ const flint_mpn_sqr_func_t flint_mpn_sqrhigh_func_tab[] =
flint_mpn_sqrhigh_7,
flint_mpn_sqrhigh_8
};
+#elif FLINT_HAVE_ASSEMBLY_armv8
+mp_limb_t flint_mpn_mulhigh_1(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mulhigh_2(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mulhigh_3(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mulhigh_4(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mulhigh_5(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mulhigh_6(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mulhigh_7(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mulhigh_8(mp_ptr, mp_srcptr, mp_srcptr);
+
+mp_limb_t flint_mpn_sqrhigh_1(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqrhigh_2(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqrhigh_3(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqrhigh_4(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqrhigh_5(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqrhigh_6(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqrhigh_7(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqrhigh_8(mp_ptr, mp_srcptr);
+
+const flint_mpn_mul_func_t flint_mpn_mulhigh_func_tab[] =
+{
+ NULL,
+ flint_mpn_mulhigh_1,
+ flint_mpn_mulhigh_2,
+ flint_mpn_mulhigh_3,
+ flint_mpn_mulhigh_4,
+ flint_mpn_mulhigh_5,
+ flint_mpn_mulhigh_6,
+ flint_mpn_mulhigh_7,
+ flint_mpn_mulhigh_8,
+};
+
+const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_normalised_func_tab[] =
+{
+ NULL,
+};
+const flint_mpn_sqr_func_t flint_mpn_sqrhigh_func_tab[] =
+{
+ NULL,
+ flint_mpn_sqrhigh_1,
+ flint_mpn_sqrhigh_2,
+ flint_mpn_sqrhigh_3,
+ flint_mpn_sqrhigh_4,
+ flint_mpn_sqrhigh_5,
+ flint_mpn_sqrhigh_6,
+ flint_mpn_sqrhigh_7,
+ flint_mpn_sqrhigh_8,
+};
#else
/* todo: add MPFR-like basecase for use in mulders */
diff --git a/src/mpn_extras/profile/p-mul.c b/src/mpn_extras/profile/p-mul.c
index c0b29a47c9..2cdf461e75 100644
--- a/src/mpn_extras/profile/p-mul.c
+++ b/src/mpn_extras/profile/p-mul.c
@@ -16,24 +16,42 @@
#define N_MAX 16
#define N_MAX2 256
-static void measure(mp_ptr rpg, mp_ptr rpf, mp_ptr xp, mp_ptr yp, slong m)
+static void measure(mp_ptr rpg, mp_ptr rpf, mp_ptr xp, mp_ptr yp, slong m, slong n_max)
{
double t1, t2, FLINT_SET_BUT_UNUSED(__);
slong n;
flint_printf("m = %3wd:", m);
- for (n = 1; n <= m; n = (n <= N_MAX ? n + 1 : n * 1.2))
+ for (n = 1; n <= n_max; n = (n <= N_MAX ? n + 1 : n * 1.2))
{
- mpn_random2(xp, m);
- mpn_random2(yp, n);
+ mp_ptr ap, bp;
+ slong mt, nt;
+
+ if (m >= n)
+ {
+ mt = m;
+ nt = n;
+ ap = xp;
+ bp = yp;
+ }
+ else
+ {
+ mt = n;
+ nt = m;
+ ap = yp;
+ bp = xp;
+ }
+
+ mpn_random2(ap, mt);
+ mpn_random2(bp, nt);
TIMEIT_START
- mpn_mul(rpg, xp, m, yp, n);
+ mpn_mul(rpg, ap, mt, bp, nt);
TIMEIT_STOP_VALUES(__, t1)
TIMEIT_START
- flint_mpn_mul(rpf, xp, m, yp, n);
+ flint_mpn_mul(rpf, ap, mt, bp, nt);
TIMEIT_STOP_VALUES(__, t2)
flint_printf(" %.2f", t1 / t2);
@@ -44,21 +62,42 @@ static void measure(mp_ptr rpg, mp_ptr rpf, mp_ptr xp, mp_ptr yp, slong m)
int main(int argc, char ** argv)
{
- mp_limb_t xp[N_MAX2], yp[N_MAX2], rpf[2 * N_MAX2], rpg[2 * N_MAX2];
+ mp_ptr xp, yp, rpf, rpg;
slong m;
+ xp = flint_malloc(sizeof(mp_limb_t) * N_MAX2);
+ yp = flint_malloc(sizeof(mp_limb_t) * N_MAX2);
+ rpf = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX2);
+ rpg = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX2);
+
if (argc == 2)
{
- measure(rpg, rpf, xp, yp, strtol(argv[1], NULL, 10));
+ measure(rpg, rpf, xp, yp, strtol(argv[1], NULL, 10), N_MAX2);
+ goto end;
+ }
+ else if (argc == 3)
+ {
+ slong m = strtol(argv[1], NULL, 10);
+ slong n_max = strtol(argv[2], NULL, 10);
+ xp = flint_realloc(xp, sizeof(mp_limb_t) * m);
+ yp = flint_realloc(yp, sizeof(mp_limb_t) * n_max);
+ rpf = flint_realloc(rpf, sizeof(mp_limb_t) * (m + n_max));
+ rpg = flint_realloc(rpg, sizeof(mp_limb_t) * (m + n_max));
+ measure(rpg, rpf, xp, yp, m, n_max);
goto end;
}
flint_printf("mpn_mul vs flint_mpn_mul\n\n");
for (m = 1; m <= N_MAX2; m = (m <= N_MAX ? m + 1 : m * 1.2))
- measure(rpg, rpf, xp, yp, m);
+ measure(rpg, rpf, xp, yp, m, m);
end:
+ flint_free(xp);
+ flint_free(yp);
+ flint_free(rpf);
+ flint_free(rpg);
+
flint_cleanup_master();
return 0;
}
diff --git a/src/mpn_extras/sqr_basecase.c b/src/mpn_extras/sqr_basecase.c
index 8591e28598..5feab800a4 100644
--- a/src/mpn_extras/sqr_basecase.c
+++ b/src/mpn_extras/sqr_basecase.c
@@ -99,6 +99,30 @@ const flint_mpn_sqr_func_t flint_mpn_sqr_func_tab[] = {
flint_mpn_sqr_14
};
+#elif FLINT_HAVE_ASSEMBLY_armv8
+mp_limb_t flint_mpn_sqr_1(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_2(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_3(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_4(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_5(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_6(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_7(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_8(mp_ptr, mp_srcptr);
+mp_limb_t flint_mpn_sqr_9(mp_ptr, mp_srcptr);
+
+const flint_mpn_sqr_func_t flint_mpn_sqr_func_tab[] =
+{
+ NULL,
+ flint_mpn_sqr_1,
+ flint_mpn_sqr_2,
+ flint_mpn_sqr_3,
+ flint_mpn_sqr_4,
+ flint_mpn_sqr_5,
+ flint_mpn_sqr_6,
+ flint_mpn_sqr_7,
+ flint_mpn_sqr_8,
+ flint_mpn_sqr_9
+};
#else
/* Currently generic C code performs worse than GMP. */
diff --git a/src/mpn_extras/sqrhigh.c b/src/mpn_extras/sqrhigh.c
index baeb944f40..e6d650e424 100644
--- a/src/mpn_extras/sqrhigh.c
+++ b/src/mpn_extras/sqrhigh.c
@@ -17,8 +17,6 @@
#include "mpn_extras.h"
-#if FLINT_HAVE_NATIVE_mpn_mulhigh_basecase
-
/* Generated by tune-sqrhigh.c */
static const signed short flint_mpn_sqrhigh_k_tab[FLINT_MPN_SQRHIGH_K_TAB_SIZE] =
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -167,16 +165,6 @@ _flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n)
return bot;
}
-mp_limb_t flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n)
-{
- FLINT_ASSERT(n >= 1);
-
- if (FLINT_HAVE_SQRHIGH_FUNC(n)) /* NOTE: Aliasing allowed here */
- return flint_mpn_sqrhigh_func_tab[n](rp, xp);
- else
- return _flint_mpn_sqrhigh_basecase(rp, xp, n);
-}
-
mp_limb_t
_flint_mpn_sqrhigh(mp_ptr res, mp_srcptr u, mp_size_t n)
{
@@ -187,7 +175,3 @@ _flint_mpn_sqrhigh(mp_ptr res, mp_srcptr u, mp_size_t n)
else
return _flint_mpn_sqrhigh_sqr(res, u, n);
}
-
-#else
-typedef int this_file_is_empty;
-#endif
diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c
index e28b927bd1..999280ff88 100644
--- a/src/mpn_extras/test/main.c
+++ b/src/mpn_extras/test/main.c
@@ -25,14 +25,14 @@
#include "t-mul_n.c"
#include "t-mul_toom22.c"
#include "t-mullow_n.c"
-#include "t-mulhigh.c"
+#include "t-mulhigh_n.c"
#include "t-mulhigh_normalised.c"
#include "t-mulmod_2expp1.c"
#include "t-mulmod_preinv1.c"
#include "t-mulmod_preinvn.c"
#include "t-remove_2exp.c"
#include "t-remove_power.c"
-#include "t-sqrhigh_basecase.c"
+#include "t-sqr.c"
#include "t-sqrhigh.c"
/* Array of test functions ***************************************************/
@@ -50,14 +50,14 @@ test_struct tests[] =
TEST_FUNCTION(flint_mpn_mul_n),
TEST_FUNCTION(flint_mpn_mul_toom22),
TEST_FUNCTION(flint_mpn_mullow_n),
- TEST_FUNCTION(flint_mpn_mulhigh),
+ TEST_FUNCTION(flint_mpn_mulhigh_n),
TEST_FUNCTION(flint_mpn_mulhigh_normalised),
TEST_FUNCTION(flint_mpn_mulmod_2expp1),
TEST_FUNCTION(flint_mpn_mulmod_preinv1),
TEST_FUNCTION(flint_mpn_mulmod_preinvn),
TEST_FUNCTION(flint_mpn_remove_2exp),
TEST_FUNCTION(flint_mpn_remove_power),
- TEST_FUNCTION(flint_mpn_sqrhigh_basecase),
+ TEST_FUNCTION(flint_mpn_sqr),
TEST_FUNCTION(flint_mpn_sqrhigh)
};
diff --git a/src/mpn_extras/test/t-mul.c b/src/mpn_extras/test/t-mul.c
index e3187d7b48..f1981cce47 100644
--- a/src/mpn_extras/test/t-mul.c
+++ b/src/mpn_extras/test/t-mul.c
@@ -72,3 +72,6 @@ TEST_FUNCTION_START(flint_mpn_mul, state)
#undef N_MIN
#undef N_MAX
+#undef WANT_BIG
+#undef N_MIN_STOR
+#undef N_MAX_STOR
diff --git a/src/mpn_extras/test/t-mulhigh.c b/src/mpn_extras/test/t-mulhigh.c
deleted file mode 100644
index fba5b702bd..0000000000
--- a/src/mpn_extras/test/t-mulhigh.c
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- Copyright (C) 2024 Albin Ahlbäck
-
- This file is part of FLINT.
-
- FLINT is free software: you can redistribute it and/or modify it under
- the terms of the GNU Lesser General Public License (LGPL) as published
- by the Free Software Foundation; either version 3 of the License, or
- (at your option) any later version. See .
-*/
-
-#include "test_helpers.h"
-#include "mpn_extras.h"
-
-/* TODO: Remove this preprocessor conditional */
-#if FLINT_HAVE_NATIVE_mpn_mulhigh_basecase
-
-#define N_MAX (FLINT_MPN_MULHIGH_MUL_CUTOFF + 100)
-
-#define rpcH (rpc + n - 1)
-
-/* Defined in t-mulhigh.c and t-sqrhigh.c */
-#ifndef lower_bound
-# define lower_bound lower_bound
-static ulong lower_bound(ulong n)
-{
- /* These are calculated by hand lower bound for the returned limb */
- if (n < 3)
- return 0;
- else
- return 4 * n - 8;
-}
-#endif
-
-TEST_FUNCTION_START(flint_mpn_mulhigh, state)
-{
- slong ix;
- int result;
-
- mp_ptr rp, rpc, xp, yp;
-
- rp = flint_malloc(sizeof(mp_limb_t) * N_MAX);
- rpc = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX);
- xp = flint_malloc(sizeof(mp_limb_t) * N_MAX);
- yp = flint_malloc(sizeof(mp_limb_t) * N_MAX);
-
- for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++)
- {
- mp_limb_t borrow;
- mp_size_t n;
-
- /* Trigger full multiplication in mulhigh */
- if (n_randint(state, 1000) == 0)
- n = FLINT_MPN_MULHIGH_MUL_CUTOFF + n_randint(state, 50);
- else if (n_randint(state, 100) == 0)
- n = 1 + n_randint(state, FLINT_MPN_MULHIGH_MUL_CUTOFF);
- else
- n = 1 + n_randint(state, 2 * FLINT_MPN_MULHIGH_MULDERS_CUTOFF);
-
- mpn_random2(xp, n);
- mpn_random2(yp, n);
-
- rp[0] = flint_mpn_mulhigh_n(rp + 1, xp, yp, n);
-
- /* Check upper bound */
- flint_mpn_mul_n(rpc, xp, yp, n);
-
- result = (mpn_cmp(rp, rpcH, n + 1) <= 0);
- if (!result)
- TEST_FUNCTION_FAIL(
- "Bigger than upper bound!\n"
- "ix = %wd\n"
- "n = %wd\n"
- "xp = %{ulong*}\n"
- "yp = %{ulong*}\n"
- "Upper bound: %{ulong*}\n"
- "Got: %{ulong*}\n",
- ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1);
-
- /* Check lower bound */
- borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n));
- if (borrow)
- mpn_zero(rpcH, n + 1);
-
- result = (mpn_cmp(rp, rpcH, n + 1) >= 0);
- if (!result)
- TEST_FUNCTION_FAIL(
- "Smaller than lower bound!\n"
- "ix = %wd\n"
- "n = %wd\n"
- "xp = %{ulong*}\n"
- "yp = %{ulong*}\n"
- "Lower bound: %{ulong*}\n"
- "Got: %{ulong*}\n",
- ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1);
- }
-
- flint_free(rp);
- flint_free(rpc);
- flint_free(xp);
- flint_free(yp);
-
- TEST_FUNCTION_END(state);
-}
-# undef N_MIN
-# undef N_MAX
-# undef N_MAX2
-#else
-TEST_FUNCTION_START(flint_mpn_mulhigh, state)
-{
- TEST_FUNCTION_END_SKIPPED(state);
-}
-#endif
diff --git a/src/mpn_extras/test/t-mulhigh_n.c b/src/mpn_extras/test/t-mulhigh_n.c
new file mode 100644
index 0000000000..06a42763bd
--- /dev/null
+++ b/src/mpn_extras/test/t-mulhigh_n.c
@@ -0,0 +1,202 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+ Copyright (C) 2024 Fredrik Johansson
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+ (at your option) any later version. See .
+*/
+
+#include "test_helpers.h"
+#include "mpn_extras.h"
+
+#define N_MIN 1
+
+#define WANT_STOR 1
+
+#if WANT_STOR
+# define MAX_ALLOC_SIZE (FLINT_MPN_MULHIGH_MUL_CUTOFF + 50)
+#else
+# define N_MAX FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH
+# define MAX_ALLOC_SIZE (2 * N_MAX)
+#endif
+
+#define GET_N_FULL_MUL(state) (N_MIN + FLINT_MPN_MULHIGH_MUL_CUTOFF + n_randint(state, 50 - N_MIN + 1))
+#define GET_N_LARGE(state) (N_MIN + n_randint(state, FLINT_MPN_MULHIGH_MUL_CUTOFF - N_MIN + 1))
+#define GET_N_SMALL(state) (N_MIN + n_randint(state, 2 * FLINT_MPN_MULHIGH_MULDERS_CUTOFF - N_MIN + 1))
+
+#define rpcH (rpc + n - 1)
+
+/* Defined in t-mulhigh.c and t-sqrhigh.c */
+#ifndef generic_mulhigh_basecase
+# define generic_mulhigh_basecase generic_mulhigh_basecase
+static mp_limb_t
+generic_mulhigh_basecase(mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+ mp_limb_t low;
+ mp_limb_t a, b;
+
+ if (n == 1)
+ {
+ umul_ppmm(rp[0], low, up[0], vp[0]);
+ }
+ else if (n == 2)
+ {
+ FLINT_MPN_MUL_2X2(rp[1], rp[0], low, b, up[1], up[0], vp[1], vp[0]);
+ }
+ else if (n == 3)
+ {
+ NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, 2);
+ NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, 3);
+ NN_DOTREV_S3_A3_1X1(b, a, rp[0], 0, b, a, up + 1, vp + 1, 2);
+ NN_ADDMUL_S2_A2_1X1(rp[2], rp[1], b, a, up[2], vp[2]);
+ }
+ else
+ {
+ mp_limb_t tp[2];
+ slong ix;
+
+ NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, n - 1);
+ NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, n);
+ tp[0] = a;
+ tp[1] = b;
+
+ umul_ppmm(rp[1], rp[0], up[n - 1], vp[1]);
+ for (ix = 2; ix < n; ix++)
+ rp[ix] = mpn_addmul_1(rp, up + n - ix, ix, vp[ix]);
+
+ mpn_add(rp, rp, n, tp, 2);
+ }
+
+ return low;
+}
+#endif
+
+/* Defined in t-mulhigh.c and t-sqrhigh.c */
+#ifndef lower_bound
+# define lower_bound lower_bound
+static ulong lower_bound(ulong n)
+{
+ /* These are calculated by hand lower bound for the returned limb */
+ if (n < 3)
+ return 0;
+ else
+ return 4 * n - 8;
+}
+#endif
+
+TEST_FUNCTION_START(flint_mpn_mulhigh_n, state)
+{
+ slong ix;
+ int result;
+ mp_ptr rp, rpc, xp, yp;
+
+ rpc = flint_malloc(2 * sizeof(mp_limb_t) * MAX_ALLOC_SIZE);
+
+ for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++)
+ {
+ mp_limb_t borrow;
+ mp_size_t n;
+
+#if WANT_STOR
+ /* Trigger full multiplication in mulhigh */
+ if (n_randint(state, 1000) == 0)
+ n = GET_N_FULL_MUL(state);
+ else if (n_randint(state, 100) == 0)
+ n = GET_N_LARGE(state);
+ else
+ n = GET_N_SMALL(state);
+#else
+ n = N_MIN + n_randint(state, N_MAX - N_MIN + 1);
+#endif
+
+ xp = flint_malloc(sizeof(mp_limb_t) * n);
+ yp = flint_malloc(sizeof(mp_limb_t) * n);
+
+ mpn_random2(xp, n);
+ mpn_random2(yp, n);
+
+ if (n <= FLINT_MAX(FLINT_MPN_MULHIGH_MULDERS_CUTOFF, FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH))
+ {
+ /* Check that it is coherent with generic version */
+ mp_limb_t ret0, ret1;
+
+ rp = flint_malloc(sizeof(mp_limb_t) * n);
+
+ ret0 = generic_mulhigh_basecase(rpc, xp, yp, n);
+ ret1 = flint_mpn_mulhigh_n(rp, xp, yp, n);
+
+ result = (mpn_cmp(rp, rpc, n) == 0 && ret0 == ret1);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "Basecase not coherent with generic version.\n"
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "yp = %{ulong*}\n"
+ "Expected ret: %{ulong}\n"
+ "Got ret: %{ulong}\n"
+ "Expected limbs: %{ulong*}\n"
+ "Got limbs: %{ulong*}\n",
+ ix, n, xp, n, yp, n, ret0, ret1, rpc, n, rp, n);
+ }
+ else
+ {
+ /* Check that it is coherent with bounds */
+ rp = flint_malloc(sizeof(mp_limb_t) * (n + 1));
+
+ rp[0] = flint_mpn_mulhigh_n(rp + 1, xp, yp, n);
+ flint_mpn_mul_n(rpc, xp, yp, n);
+
+ result = (mpn_cmp(rp, rpcH, n + 1) <= 0);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "Bigger than upper bound!\n"
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "yp = %{ulong*}\n"
+ "Upper bound: %{ulong*}\n"
+ "Got: %{ulong*}\n",
+ ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1);
+
+ /* Check lower bound */
+ borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n));
+ if (borrow)
+ mpn_zero(rpcH, n + 1);
+
+ result = (mpn_cmp(rp, rpcH, n + 1) >= 0);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "Smaller than lower bound!\n"
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "yp = %{ulong*}\n"
+ "Lower bound: %{ulong*}\n"
+ "Got: %{ulong*}\n",
+ ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1);
+ }
+
+ flint_free(xp);
+ flint_free(yp);
+ flint_free(rp);
+ }
+
+ flint_free(rpc);
+
+ TEST_FUNCTION_END(state);
+}
+
+#undef N_MIN
+#undef N_MAX
+#undef WANT_STOR
+#undef rpcH
+#undef MAX_ALLOC_SIZE
+
+#undef GET_N_FULL_MUL
+#undef GET_N_LARGE
+#undef GET_N_SMALL
diff --git a/src/mpn_extras/test/t-sqr.c b/src/mpn_extras/test/t-sqr.c
index 22efdd33ac..0b7c321778 100644
--- a/src/mpn_extras/test/t-sqr.c
+++ b/src/mpn_extras/test/t-sqr.c
@@ -50,9 +50,9 @@ TEST_FUNCTION_START(flint_mpn_sqr, state)
TEST_FUNCTION_FAIL(
"n = %wd\n"
"xp = %{ulong*}\n"
- "rp1 = %{ulong*}\n"
- "rp2 = %{ulong*}\n",
- n, xp, n, rp1, 2 * n, rp2, 2 * n);
+ "Expected: %{ulong*}\n"
+ "Got: %{ulong*}\n",
+ n, xp, n, rp2, 2 * n, rp1, 2 * n);
flint_free(xp);
flint_free(rp1);
@@ -61,3 +61,9 @@ TEST_FUNCTION_START(flint_mpn_sqr, state)
TEST_FUNCTION_END(state);
}
+
+#undef N_MIN
+#undef N_MAX
+#undef WANT_BIG
+#undef N_MIN_STOR
+#undef N_MAX_STOR
diff --git a/src/mpn_extras/test/t-sqrhigh.c b/src/mpn_extras/test/t-sqrhigh.c
index 6ff0db91b8..ac5ec759bd 100644
--- a/src/mpn_extras/test/t-sqrhigh.c
+++ b/src/mpn_extras/test/t-sqrhigh.c
@@ -12,13 +12,68 @@
#include "test_helpers.h"
#include "mpn_extras.h"
-/* TODO: Remove this preprocessor conditional */
-#if FLINT_HAVE_NATIVE_mpn_mulhigh_basecase
+#define N_MIN 1
-#define N_MAX FLINT_MAX(FLINT_MPN_SQRHIGH_SQR_CUTOFF + 50, 2 * FLINT_MPN_SQRHIGH_MULDERS_CUTOFF)
+#define WANT_STOR 1
+
+#if WANT_STOR
+# define MAX_ALLOC_SIZE (FLINT_MPN_SQRHIGH_SQR_CUTOFF + 50)
+#else
+# define N_MAX FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH
+# define MAX_ALLOC_SIZE (2 * N_MAX)
+#endif
+
+#define GET_N_FULL_SQR(state) (N_MIN + FLINT_MPN_SQRHIGH_SQR_CUTOFF + n_randint(state, 50 - N_MIN + 1))
+#define GET_N_LARGE(state) (N_MIN + n_randint(state, FLINT_MPN_SQRHIGH_SQR_CUTOFF - N_MIN + 1))
+#define GET_N_SMALL(state) (N_MIN + n_randint(state, 2 * FLINT_MPN_SQRHIGH_MULDERS_CUTOFF - N_MIN + 1))
#define rpcH (rpc + n - 1)
+/* Defined in t-mulhigh.c and t-sqrhigh.c */
+#ifndef generic_mulhigh_basecase
+# define generic_mulhigh_basecase generic_mulhigh_basecase
+static mp_limb_t
+generic_mulhigh_basecase(mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n)
+{
+ mp_limb_t low;
+ mp_limb_t a, b;
+
+ if (n == 1)
+ {
+ umul_ppmm(rp[0], low, up[0], vp[0]);
+ }
+ else if (n == 2)
+ {
+ FLINT_MPN_MUL_2X2(rp[1], rp[0], low, b, up[1], up[0], vp[1], vp[0]);
+ }
+ else if (n == 3)
+ {
+ NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, 2);
+ NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, 3);
+ NN_DOTREV_S3_A3_1X1(b, a, rp[0], 0, b, a, up + 1, vp + 1, 2);
+ NN_ADDMUL_S2_A2_1X1(rp[2], rp[1], b, a, up[2], vp[2]);
+ }
+ else
+ {
+ mp_limb_t tp[2];
+ slong ix;
+
+ NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, n - 1);
+ NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, n);
+ tp[0] = a;
+ tp[1] = b;
+
+ umul_ppmm(rp[1], rp[0], up[n - 1], vp[1]);
+ for (ix = 2; ix < n; ix++)
+ rp[ix] = mpn_addmul_1(rp, up + n - ix, ix, vp[ix]);
+
+ mpn_add(rp, rp, n, tp, 2);
+ }
+
+ return low;
+}
+#endif
+
/* Defined in t-mulhigh.c and t-sqrhigh.c */
#ifndef lower_bound
# define lower_bound lower_bound
@@ -36,73 +91,105 @@ TEST_FUNCTION_START(flint_mpn_sqrhigh, state)
{
slong ix;
int result;
-
mp_ptr rp, rpc, xp;
- rp = flint_malloc(sizeof(mp_limb_t) * (N_MAX + 1));
- rpc = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX);
- xp = flint_malloc(sizeof(mp_limb_t) * N_MAX);
+ rpc = flint_malloc(2 * sizeof(mp_limb_t) * MAX_ALLOC_SIZE);
for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++)
{
mp_limb_t borrow;
mp_size_t n;
+
+#if WANT_STOR
/* Trigger full multiplication in mulhigh */
if (n_randint(state, 1000) == 0)
- n = 1 + FLINT_MPN_SQRHIGH_SQR_CUTOFF + n_randint(state, 50);
+ n = GET_N_FULL_SQR(state);
else if (n_randint(state, 100) == 0)
- n = 1 + n_randint(state, FLINT_MPN_SQRHIGH_SQR_CUTOFF);
+ n = GET_N_LARGE(state);
else
- n = 1 + n_randint(state, 2 * FLINT_MPN_SQRHIGH_MULDERS_CUTOFF);
+ n = GET_N_SMALL(state);
+#else
+ n = N_MIN + n_randint(state, N_MAX - N_MIN + 1);
+#endif
+
+ xp = flint_malloc(sizeof(mp_limb_t) * n);
mpn_random2(xp, n);
- rp[0] = flint_mpn_sqrhigh(rp + 1, xp, n);
-
- /* Check upper bound */
- flint_mpn_sqr(rpc, xp, n);
-
- result = (mpn_cmp(rp, rpcH, n + 1) <= 0);
- if (!result)
- TEST_FUNCTION_FAIL(
- "Bigger than upper bound!\n"
- "ix = %wd\n"
- "n = %wd\n"
- "xp = %{ulong*}\n"
- "yp = %{ulong*}\n"
- "Upper bound: %{ulong*}\n"
- "Got: %{ulong*}\n",
- ix, n, xp, n, rpcH, n + 1, rp, n + 1);
-
- /* Check lower bound */
- borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n));
- if (borrow)
- mpn_zero(rpcH, n + 1);
-
- result = (mpn_cmp(rp, rpcH, n + 1) >= 0);
- if (!result)
- TEST_FUNCTION_FAIL(
- "Smaller than lower bound!\n"
- "ix = %wd\n"
- "n = %wd\n"
- "xp = %{ulong*}\n"
- "Lower bound: %{ulong*}\n"
- "Got: %{ulong*}\n",
- ix, n, xp, n, rpcH, n + 1, rp, n + 1);
+ if (n <= FLINT_MAX(FLINT_MPN_SQRHIGH_MULDERS_CUTOFF, FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH))
+ {
+ /* Check that it is coherent with generic version */
+ mp_limb_t ret0, ret1;
+
+ rp = flint_malloc(sizeof(mp_limb_t) * n);
+
+ ret0 = generic_mulhigh_basecase(rpc, xp, xp, n);
+ ret1 = flint_mpn_sqrhigh(rp, xp, n);
+
+ result = (mpn_cmp(rp, rpc, n) == 0 && ret0 == ret1);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "Basecase not coherent with generic version.\n"
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "Expected ret: %{ulong}\n"
+ "Got ret: %{ulong}\n"
+ "Expected limbs: %{ulong*}\n"
+ "Got limbs: %{ulong*}\n",
+ ix, n, xp, n, ret0, ret1, rpc, n, rp, n);
+ }
+ else
+ {
+ /* Check that it is coherent with bounds */
+ rp = flint_malloc(sizeof(mp_limb_t) * (n + 1));
+
+ rp[0] = flint_mpn_sqrhigh(rp + 1, xp, n);
+ flint_mpn_sqr(rpc, xp, n);
+
+ result = (mpn_cmp(rp, rpcH, n + 1) <= 0);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "Bigger than upper bound!\n"
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "Upper bound: %{ulong*}\n"
+ "Got: %{ulong*}\n",
+ ix, n, xp, n, rpcH, n + 1, rp, n + 1);
+
+ /* Check lower bound */
+ borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n));
+ if (borrow)
+ mpn_zero(rpcH, n + 1);
+
+ result = (mpn_cmp(rp, rpcH, n + 1) >= 0);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "Smaller than lower bound!\n"
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "Lower bound: %{ulong*}\n"
+ "Got: %{ulong*}\n",
+ ix, n, xp, n, rpcH, n + 1, rp, n + 1);
+ }
+
+ flint_free(xp);
+ flint_free(rp);
}
- flint_free(rp);
flint_free(rpc);
- flint_free(xp);
TEST_FUNCTION_END(state);
}
-# undef N_MAX
-# undef rpcH
-#else
-TEST_FUNCTION_START(flint_mpn_sqrhigh, state)
-{
- TEST_FUNCTION_END_SKIPPED(state);
-}
-#endif
+#undef N_MIN
+#undef N_MAX
+#undef WANT_STOR
+#undef rpcH
+#undef MAX_ALLOC_SIZE
+
+#undef GET_N_FULL_SQR
+#undef GET_N_LARGE
+#undef GET_N_SMALL
diff --git a/src/mpn_extras/test/t-sqrhigh_basecase.c b/src/mpn_extras/test/t-sqrhigh_basecase.c
deleted file mode 100644
index 7993a94704..0000000000
--- a/src/mpn_extras/test/t-sqrhigh_basecase.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- Copyright (C) 2024 Albin Ahlbäck
-
- This file is part of FLINT.
-
- FLINT is free software: you can redistribute it and/or modify it under
- the terms of the GNU Lesser General Public License (LGPL) as published
- by the Free Software Foundation; either version 3 of the License, or
- (at your option) any later version. See .
-*/
-
-#include "test_helpers.h"
-#include "mpn_extras.h"
-
-/* TODO: Remove this preprocessor conditional */
-#if FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase
-
-# define N_MIN 1
-# define N_MAX 64
-
-TEST_FUNCTION_START(flint_mpn_sqrhigh_basecase, state)
-{
- slong ix;
- int result;
-
- for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++)
- {
- mp_limb_t rp1[N_MAX + 1];
- mp_limb_t rp2[N_MAX + 1];
- mp_limb_t xp[N_MAX];
- mp_size_t n;
-
- n = N_MIN + n_randint(state, N_MAX - N_MIN + 1);
-
- mpn_random2(xp, n);
-
- if (FLINT_HAVE_MULHIGH_FUNC(n))
- rp1[0] = flint_mpn_mulhigh_func_tab[n](rp1 + 1, xp, xp);
- else
- rp1[0] = _flint_mpn_mulhigh_basecase(rp1 + 1, xp, xp, n);
-
- rp2[0] = flint_mpn_sqrhigh_basecase(rp2 + 1, xp, n);
-
- result = (mpn_cmp(rp1, rp2, n + 1) == 0);
- if (!result)
- TEST_FUNCTION_FAIL(
- "Wrong result!\n"
- "ix = %wd\n"
- "n = %wd\n"
- "xp = %{ulong*}\n"
- "Expected: %{ulong*}\n"
- "Got: %{ulong*}\n",
- ix, n, xp, n, rp1, n + 1, rp2, n + 1);
- }
-
- TEST_FUNCTION_END(state);
-}
-# undef N_MIN
-# undef N_MAX
-#else
-TEST_FUNCTION_START(flint_mpn_sqrhigh_basecase, state)
-{
- TEST_FUNCTION_END_SKIPPED(state);
-}
-#endif
diff --git a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm
index 86f4bb0088..79215a840c 100644
--- a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm
+++ b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm
@@ -409,12 +409,10 @@ undefine(`ld3')
pop %rbx
ret
-
+EPILOGUE()
JUMPTABSECT
ALIGN(8)
L(etab):JMPENT( L(ep0), L(etab))
JMPENT( L(ep1), L(etab))
JMPENT( L(ep2), L(etab))
JMPENT( L(ep3), L(etab))
- TEXT
-EPILOGUE()
diff --git a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm
index 3be67692f7..58767eccf1 100644
--- a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm
+++ b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm
@@ -397,12 +397,10 @@ undefine(`ld3')
pop %rbx
ret
-
+EPILOGUE()
JUMPTABSECT
ALIGN(8)
L(dtab):JMPENT( L(dp0), L(dtab))
JMPENT( L(dp1), L(dtab))
JMPENT( L(dp2), L(dtab))
JMPENT( L(dp3), L(dtab))
- TEXT
-EPILOGUE()