diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 1d8420ccb1..28d57af8c8 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -247,11 +247,11 @@ mp_limb_t mpn_rsh1sub_n(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); # define FLINT_MPN_SQR_HARD(rp, xp, n) (flint_mpn_sqr_func_tab[n](rp, xp)) #elif FLINT_HAVE_ASSEMBLY_armv8 # define FLINT_MPN_MUL_FUNC_N_TAB_WIDTH 15 -# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 0 +# define FLINT_MPN_SQR_FUNC_TAB_WIDTH 9 # define FLINT_HAVE_MUL_FUNC(n, m) FLINT_HAVE_MUL_N_FUNC(n) # define FLINT_HAVE_MUL_N_FUNC(n) ((n) <= FLINT_MPN_MUL_FUNC_N_TAB_WIDTH) -# define FLINT_HAVE_SQR_FUNC(n) (0) +# define FLINT_HAVE_SQR_FUNC(n) ((n) <= FLINT_MPN_SQR_FUNC_TAB_WIDTH) # define FLINT_MPN_MUL_HARD(rp, xp, xn, yp, yn) (flint_mpn_mul_func_n_tab[xn](rp, xp, yp, yn)) # define FLINT_MPN_MUL_N_HARD(rp, xp, yp, n) (flint_mpn_mul_func_n_tab[n](rp, xp, yp, n)) @@ -417,6 +417,19 @@ FLINT_DLL extern const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_nor # define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 9 # define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8 # define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 9 + +/* NOTE: This function only works for n >= 6 */ +# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1 + +/* NOTE: The x86_64_adx versions of these functions only works for n >= 6 */ +# define FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase 1 +#elif FLINT_HAVE_ASSEMBLY_armv8 +# define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 8 +# define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 8 +# define FLINT_MPN_MULHIGH_NORMALISED_FUNC_TAB_WIDTH 0 + +/* NOTE: This function only works for n > 8 */ +# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1 #else # define FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH 16 # define FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH 2 @@ -429,8 +442,6 @@ FLINT_DLL extern const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_nor void _flint_mpn_mulhigh_n_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n); -/* NOTE: The x86_64_adx version of this function only works for n >= 6 */ -# define FLINT_HAVE_NATIVE_mpn_mulhigh_basecase 1 mp_limb_t _flint_mpn_mulhigh_basecase(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n); mp_limb_t _flint_mpn_mulhigh_n_mulders(mp_ptr res, mp_srcptr u, mp_srcptr v, mp_size_t n); @@ -468,9 +479,6 @@ void flint_mpn_mul_or_mulhigh_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t #define FLINT_MPN_SQRHIGH_SQR_CUTOFF 2000 #define FLINT_MPN_SQRHIGH_K_TAB_SIZE 2048 -/* NOTE: The x86_64_adx versions of these functions only works for n >= 6 */ -# define FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase 1 - #if FLINT_HAVE_ASSEMBLY_x86_64_adx mp_limb_t _flint_mpn_sqrhigh_basecase_even(mp_ptr, mp_srcptr, mp_size_t); @@ -499,8 +507,6 @@ MPN_EXTRAS_INLINE mp_limb_t _flint_mpn_sqrhigh_basecase(mp_ptr res, mp_srcptr u, void _flint_mpn_sqrhigh_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_size_t n); mp_limb_t _flint_mpn_sqrhigh_mulders(mp_ptr res, mp_srcptr u, mp_size_t n); -mp_limb_t _flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n); -mp_limb_t flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n); mp_limb_t _flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n); mp_limb_t _flint_mpn_sqrhigh(mp_ptr, mp_srcptr, mp_size_t); diff --git a/src/mpn_extras/arm64/arm64-defs.m4 b/src/mpn_extras/arm64/arm64-defs.m4 index 46149f7bfd..112b60e550 100644 --- a/src/mpn_extras/arm64/arm64-defs.m4 +++ b/src/mpn_extras/arm64/arm64-defs.m4 @@ -37,10 +37,15 @@ dnl don't want to disable macro expansions in or after them. changecom -dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol) +dnl LEA_HI(reg,gmp_symbol) +dnl LEA_LO(reg,gmp_symbol) +dnl LOH_ADRPADD(labelhi,labello) dnl -dnl Load the address of gmp_symbol into a register. We split this into two -dnl parts to allow separation for manual insn scheduling. +dnl Load the address of gmp_symbol into a register. label(hi|lo) has to +dnl be unique labels that LEA_HI and LEA_LO are put on. We split this +dnl into three parts to allow separation for manual insn scheduling. +dnl LOH_ADRPADD has to be called as well, but remains unused on +dnl non-Darwin systems. ifdef(`PIC',`dnl define(`LEA_HI', `adrp $1, :got:$2')dnl @@ -50,4 +55,18 @@ define(`LEA_HI', `adrp $1, $2')dnl define(`LEA_LO', `add $1, $1, :lo12:$2')dnl ')dnl +define(`LOH_ADRPADD',`')dnl + + +dnl LBL_HI(reg,gmp_symbol) +dnl LBL_LO(reg,gmp_symbol) +dnl +dnl Load the label of gmp_symbol into a register. We split this into +dnl three parts to allow separation for manual insn scheduling. +dnl LOH_ADRPADD has to be called as well with their respective labels +dnl (*not* gmp_symbol), but remains unused on non-Darwin systems. + +define(`LBL_HI', `adrp $1, $2')dnl +define(`LBL_LO', `add $1, $1, :lo12:$2')dnl + divert`'dnl diff --git a/src/mpn_extras/arm64/darwin.m4 b/src/mpn_extras/arm64/darwin.m4 index 36e72fe45e..296f18dbb3 100644 --- a/src/mpn_extras/arm64/darwin.m4 +++ b/src/mpn_extras/arm64/darwin.m4 @@ -37,14 +37,29 @@ dnl don't want to disable macro expansions in or after them. changecom -dnl LEA_HI(reg,gmp_symbol), LEA_LO(reg,gmp_symbol) +dnl LEA_HI(reg,gmp_symbol) +dnl LEA_LO(reg,gmp_symbol) +dnl LOH_ADRPADD(labelhi,labello) dnl -dnl Load the address of gmp_symbol into a register. We split this into two -dnl parts to allow separation for manual insn scheduling. TODO: Darwin allows -dnl for relaxing these two insns into an adr and a nop, but that requires the -dnl .loh pseudo for connecting them. +dnl Load the address of gmp_symbol into a register. label(hi|lo) has +dnl to be unique labels that LEA_HI and LEA_LO are put on. We split +dnl this into three parts to allow separation for manual insn +dnl scheduling. LOH_ADRPADD has to be called as well. -define(`LEA_HI',`adrp $1, $2@GOTPAGE')dnl -define(`LEA_LO',`ldr $1, [$1, $2@GOTPAGEOFF]')dnl +define(`LEA_HI', `adrp $1, $2@GOTPAGE')dnl +define(`LEA_LO', `add $1, $1, $2@GOTPAGEOFF')dnl +define(`LOH_ADRPADD', `.loh AdrpAdd $1, $2')dnl + + +dnl LBL_HI(reg,gmp_symbol) +dnl LBL_LO(reg,gmp_symbol) +dnl +dnl Load the label of gmp_symbol into a register. We split this into +dnl three parts to allow separation for manual insn scheduling. +dnl LOH_ADRPADD has to be called as well with their respective labels +dnl (*not* gmp_symbol). + +define(`LBL_HI', `adrp $1, $2@PAGE')dnl +define(`LBL_LO', `add $1, $1, $2@PAGEOFF')dnl divert`'dnl diff --git a/src/mpn_extras/arm64/mul_hard.asm b/src/mpn_extras/arm64/mul_hard.asm index f6a9ccb94f..dbd08287d3 100644 --- a/src/mpn_extras/arm64/mul_hard.asm +++ b/src/mpn_extras/arm64/mul_hard.asm @@ -519,102 +519,120 @@ EPILOGUE() PROLOGUE(flint_mpn_mul_8n) ldr r8, [bp], #1*8 ldp r0, r1, [ap] - ldp r2, r3, [ap,#2*8] - ldp r4, r5, [ap,#4*8] - ldp r6, r7, [ap,#6*8] - - stp r14, r15, [sp, #-4*8]! + ldp r2, r3, [ap, #2*8] + stp r14, r15, [sp, #-6*8]! + ldp r4, r5, [ap, #4*8] + ldp r6, r7, [ap, #6*8] stp r16, r17, [sp, #2*8] + stp r18, r19, [sp, #4*8] -L(m8): mul r9, r0, r8 C a0 b0 +L(m8): mul r9, r0, r8 umulh r10, r0, r8 - mul r11, r1, r8 C a1 b0 - umulh r14, r1, r8 - mul ap, r2, r8 C a2 b0 - umulh r15, r2, r8 - mul r12, r3, r8 C a3 b0 + mul r11, r1, r8 + umulh r12, r1, r8 + mul r13, r2, r8 + umulh r14, r2, r8 + mul r15, r3, r8 umulh r16, r3, r8 - mul r13, r4, r8 C a4 b0 - umulh r17, r4, r8 + mul r17, r4, r8 + umulh r18, r4, r8 + mul r19, r5, r8 + umulh ap, r5, r8 + C 9, (10, 11), (12, 13), (14, 15), (16, 17), (18, 19), ap + str r9, [rp], #1*8 sub n, n, #1 adds r10, r10, r11 - adcs r14, r14, ap - adcs r15, r15, r12 - adcs r16, r16, r13 - mul r9, r5, r8 C a5 b0 - umulh r11, r5, r8 - mul ap, r6, r8 C a6 b0 - umulh r12, r6, r8 - mul r13, r7, r8 C a7 b0 - umulh r8, r7, r8 - adcs r17, r17, r9 - adcs r11, r11, ap + + mul r9, r6, r8 + umulh r11, r6, r8 + adcs r12, r12, r13 - cinc r9, r8, cs - C 10, 14, 15, 16, 17, 11, 12, 9 + adcs r14, r14, r15 + + mul r13, r7, r8 + umulh r15, r7, r8 + C 10, 12, 14, (16, 17), (18, 19), (ap, 9), (11, 13), 15 cbz n, L(f8) - stp r18, r19, [sp, #-8*8]! - stp r20, r21, [sp, #2*8] - stp r22, r23, [sp, #4*8] - str r24, [sp, #6*8] + stp r20, r21, [sp, #-6*8]! + stp r22, r23, [sp, #2*8] + str r24, [sp, #4*8] + L(a8): ldr r8, [bp], #1*8 - mul r13, r0, r8 C a0 b0 - umulh ap, r0, r8 - mul r18, r1, r8 C a1 b0 - umulh r19, r1, r8 - mul r20, r2, r8 C a2 b0 - umulh r21, r2, r8 - mul r22, r3, r8 C a3 b0 - umulh r23, r3, r8 - mul r24, r4, r8 C a4 b0 - adds ap, ap, r18 - adcs r19, r19, r20 - adcs r21, r21, r22 - adcs r23, r23, r24 + adcs r16, r16, r17 + adcs r18, r18, r19 + adcs ap, ap, r9 + adcs r11, r11, r13 + cinc r15, r15, cs + C 10, 12, 14, 16, 18, ap, 11, 15 + C Free: 20, 21, 22, 23, 24, 17, 19, 9, 13 + + mul r20, r0, r8 + mul r21, r1, r8 + mul r22, r2, r8 + mul r23, r3, r8 + mul r17, r4, r8 + mul r19, r5, r8 + mul r9, r6, r8 + mul r13, r7, r8 + + adds r20, r20, r10 + umulh r24, r7, r8 + + C (10, 20), (12, 21), (14, 22), (16, 23), (18, 17), (ap, 19), (11, 9), (15, 13), 24 + + adcs r21, r21, r12 + umulh r10, r0, r8 + + adcs r22, r22, r14 + umulh r12, r1, r8 + + adcs r23, r23, r16 + umulh r14, r2, r8 + + adcs r17, r17, r18 + umulh r16, r3, r8 + + adcs r19, r19, ap umulh r18, r4, r8 - mul r20, r5, r8 C a5 b0 - umulh r22, r5, r8 - mul r24, r6, r8 C a6 b0 - sub n, n, #1 - adcs r18, r18, r20 - adcs r22, r22, r24 - umulh r20, r6, r8 - mul r24, r7, r8 C a7 b0 - umulh r8, r7, r8 - adcs r20, r20, r24 - cinc r8, r8, cs - C 13, ap, 19, 21, 23, 18, 22, 20, 8 - - C (13, (10, 14, 15, 16, 17, 11, 12, 9)) - C <- (10, 14, 15, 16, 17, 11, 12, 9) + (13, ap, 19, 21, 23, 18, 22, 20, 8) - adds r13, r10, r13 - adcs r10, r14, ap - adcs r14, r15, r19 - adcs r15, r16, r21 - adcs r16, r17, r23 - adcs r17, r11, r18 - adcs r11, r12, r22 - adcs r12, r9, r20 - cinc r9, r8, cs - str r13, [rp], #1*8 + adcs r9, r9, r11 + umulh ap, r5, r8 + + adcs r13, r13, r15 + umulh r11, r6, r8 + + str r20, [rp], #1*8 + + cinc r15, r24, cs + adds r10, r10, r21 + adcs r12, r12, r22 + adcs r14, r14, r23 + C 10, 12, 14, (16, 17), (18, 19), (ap, 9), (11, 13), 15 + + sub n, n, #1 cbnz n, L(a8) - ldp r20, r21, [sp, #2*8] - ldp r22, r23, [sp, #4*8] - ldr r24, [sp, #6*8] - ldp r18, r19, [sp], #8*8 -L(f8): stp r10, r14, [rp] - stp r15, r16, [rp,#2*8] - stp r17, r11, [rp,#4*8] - stp r12, r9, [rp,#6*8] + ldp r22, r23, [sp, #2*8] + ldr r24, [sp, #4*8] + ldp r20, r21, [sp], #6*8 +L(f8): adcs r16, r16, r17 + adcs r18, r18, r19 + stp r10, r12, [rp] + adcs ap, ap, r9 + adcs r11, r11, r13 + stp r14, r16, [rp, #2*8] + cinc r15, r15, cs ldp r16, r17, [sp, #2*8] - ldp r14, r15, [sp], #4*8 - mov res, r9 + stp r18, ap, [rp, #4*8] + ldp r18, r19, [sp, #4*8] + stp r11, r15, [rp, #6*8] + mov res, r15 + ldp r14, r15, [sp], #6*8 + ret EPILOGUE() diff --git a/src/mpn_extras/arm64/mulhigh_basecase.asm b/src/mpn_extras/arm64/mulhigh_basecase.asm new file mode 100644 index 0000000000..3fbd446b5b --- /dev/null +++ b/src/mpn_extras/arm64/mulhigh_basecase.asm @@ -0,0 +1,419 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl Uses code adopted from the GNU MP Library. +dnl +dnl Copyright 2020 Free Software Foundation, Inc. +dnl Contributed to the GNU project by Torbjörn Granlund. +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +define(`rp', `x0') +define(`ap', `x1') +define(`bp', `x2') +define(`n', `x3') + +define(`r0', `x4') +define(`r1', `x5') +define(`r2', `x6') +define(`r3', `x7') +define(`r4', `x8') +define(`r5', `x9') +define(`r6', `x10') +define(`r7', `x11') +define(`r8', `x12') +define(`r9', `x13') +define(`r10', `x14') +define(`r11', `x15') +define(`r12', `x16') +define(`r13', `x17') + +dnl NOTE: The following has to be pushed and popped +dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times! +define(`r14', `x19') +define(`r15', `x20') +define(`r16', `x21') +define(`r17', `x22') +define(`r18', `x23') +define(`r19', `x24') +define(`r20', `x25') +define(`r21', `x26') +define(`r22', `x27') +define(`r23', `x28') +define(`r24', `x30') + +define(`res', `rp') dnl NOTE: Synonymous with rp! + +dnl Idea is to compute the first 9 by 8 block, then continue via addmul chains. + +dnl n = 10: +dnl +dnl 0 1 2 3 4 5 6 7 8 9 +dnl 0 _ _ _ _ _ _ _ h x +dnl 1 h¦x x +dnl 2 h x¦x x +dnl 3 h x x¦x x +dnl 4 h x x x¦x x +dnl 5 h x x x x¦x x +dnl 6 h x x x x x¦x x +dnl 7 h x x x x x x¦x x +dnl 8 h x x x x x x x¦x x +dnl 9 x x x x x x x x¦x x + +PROLOGUE(_flint_mpn_mulhigh_basecase) + add ap, ap, n, lsl #3 + sub ap, ap, #9*8 + stp r14, r15, [sp, #-10*8]! + ldp r9, r10, [bp] + ldp r6, r7, [ap, #6*8] + ldr r8, [ap, #8*8] + + ldp r11, r12, [bp, #2*8] + stp r16, r17, [sp, #2*8] + ldp r13, r14, [bp, #4*8] + stp r18, r19, [sp, #4*8] + + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + + stp r20, r21, [sp, #6*8] + ldp r15, r16, [bp, #6*8] + stp r22, r23, [sp, #8*8] + + C ap: 0, 1, ..., 8 + C bp: 9, ..., 16 + C Free: 17, ..., 23 + + umulh r17, r7, r9 + mul r18, r8, r9 + umulh r9, r8, r9 + umulh r19, r7, r10 + umulh r20, r8, r10 + umulh r21, r6, r10 + mul r22, r7, r10 + mul r10, r8, r10 + C (17, 18, 21, 22), (9, 19, 10), 20 + C Free: 23 + + umulh r23, r5, r11 + adds r17, r17, r18 + umulh r18, r6, r11 + adcs r9, r9, r19 + umulh r19, r7, r11 + cinc r20, r20, cs + adds r17, r17, r21 + umulh r21, r8, r11 + adcs r9, r9, r10 + mul r10, r6, r11 + cinc r20, r20, cs + adds r17, r17, r22 + mul r22, r7, r11 + mul r11, r8, r11 + + adcs r9, r9, r18 + umulh r18, r4, r12 + adcs r20, r20, r19 + umulh r19, r5, r12 + cinc r21, r21, cs + adds r17, r17, r23 + umulh r23, r6, r12 + adcs r9, r9, r22 + umulh r22, r7, r12 + adcs r20, r20, r11 + umulh r11, r8, r12 + cinc r21, r21, cs + adds r17, r17, r10 + mul r10, r5, r12 + adcs r9, r9, r19 + mul r19, r6, r12 + adcs r20, r20, r23 + mul r23, r7, r12 + mul r12, r8, r12 + + adcs r21, r21, r22 + umulh r22, r3, r13 + cinc r11, r11, cs + adds r17, r17, r18 + umulh r18, r4, r13 + adcs r9, r9, r19 + umulh r19, r5, r13 + adcs r20, r20, r23 + umulh r23, r6, r13 + adcs r21, r21, r12 + umulh r12, r7, r13 + cinc r11, r11, cs + adds r17, r17, r10 + umulh r10, r8, r13 + adcs r9, r9, r18 + mul r18, r4, r13 + adcs r20, r20, r19 + mul r19, r5, r13 + adcs r21, r21, r23 + mul r23, r6, r13 + adcs r11, r11, r12 + mul r12, r7, r13 + mul r13, r8, r13 + + cinc r10, r10, cs + adds r17, r17, r22 + umulh r22, r2, r14 + adcs r9, r9, r19 + umulh r19, r3, r14 + adcs r20, r20, r23 + umulh r23, r4, r14 + adcs r21, r21, r12 + umulh r12, r5, r14 + adcs r11, r11, r13 + umulh r13, r6, r14 + cinc r10, r10, cs + adds r17, r17, r18 + umulh r18, r7, r14 + adcs r9, r9, r19 + umulh r19, r8, r14 + adcs r20, r20, r23 + mul r23, r3, r14 + adcs r21, r21, r12 + mul r12, r4, r14 + adcs r11, r11, r13 + mul r13, r5, r14 + adcs r10, r10, r18 + mul r18, r6, r14 + cinc r19, r19, cs + adds r17, r17, r22 + mul r22, r7, r14 + mul r14, r8, r14 + + adcs r9, r9, r12 + umulh r12, r1, r15 + adcs r20, r20, r13 + umulh r13, r2, r15 + adcs r21, r21, r18 + umulh r18, r3, r15 + adcs r11, r11, r22 + umulh r22, r4, r15 + adcs r10, r10, r14 + umulh r14, r5, r15 + cinc r19, r19, cs + adds r17, r17, r23 + umulh r23, r6, r15 + adcs r9, r9, r13 + umulh r13, r7, r15 + adcs r20, r20, r18 + umulh r18, r8, r15 + adcs r21, r21, r22 + mul r22, r2, r15 + adcs r11, r11, r14 + mul r14, r3, r15 + adcs r10, r10, r23 + mul r23, r4, r15 + adcs r19, r19, r13 + mul r13, r5, r15 + cinc r18, r18, cs + adds r17, r17, r12 + mul r12, r6, r15 + adcs r9, r9, r14 + mul r14, r7, r15 + mul r15, r8, r15 + + adcs r20, r20, r23 + umulh r23, r1, r16 + adcs r21, r21, r13 + umulh r13, r2, r16 + adcs r11, r11, r12 + umulh r12, r3, r16 + adcs r10, r10, r14 + umulh r14, r4, r16 + adcs r19, r19, r15 + umulh r15, r5, r16 + cinc r18, r18, cs + adds r17, r17, r22 + umulh r22, r6, r16 + adcs r9, r9, r23 + umulh r23, r7, r16 + adcs r20, r20, r13 + umulh r13, r8, r16 + mul r1, r1, r16 + adcs r21, r21, r12 + mul r2, r2, r16 + adcs r11, r11, r14 + mul r3, r3, r16 + adcs r10, r10, r15 + mul r4, r4, r16 + adcs r19, r19, r22 + mul r5, r5, r16 + adcs r18, r18, r23 + mul r6, r6, r16 + cinc r13, r13, cs + mul r7, r7, r16 + adds r1, r17, r1 C Store lowest limb (return value) in r1 + mul r8, r8, r16 + adcs r9, r9, r2 + umulh r2, r0, r16 C Save r2 for next addmul + adcs r20, r20, r3 + stp r9, r20, [rp] + adcs r21, r21, r4 + adcs r11, r11, r5 + stp r21, r11, [rp, #2*8] + adcs r10, r10, r6 + adcs r19, r19, r7 + stp r10, r19, [rp, #4*8] + adcs r18, r18, r8 + cinc r13, r13, cs + stp r18, r13, [rp, #6*8] + +define(`b0', `r0') C loaded bp value +define(`rx', `r1') C return value +define(`cy', `r2') C carry for next iteration in loop + +define(`ix', `r3') C iterator +define(`ixsave',`r4') C saved iterator +define(`ns', `r5') C resetter for ap and rp + +define(`jmp', `r6') C jump values +dnl 7 and 20, ..., 23 will remain unused for the rest of the program + +L(hi): LBL_HI( jmp, L(p1h)) + mov ixsave, #2 + mov ix, #2 + ldp r20, r21, [sp, #6*8] C No longer in use + ldp r22, r23, [sp, #8*8] +L(lo): LBL_LO( jmp, L(p1h)) + LOH_ADRPADD(L(hi), L(lo)) + ldr b0, [bp, #8*8]! + mov ns, #8*8 + sub n, n, #8 + +L(p1h): ldr r8, [ap], #1*8 + mul r12, r8, b0 + umulh r8, r8, b0 + add jmp, jmp, #9*4 C jmp goes from p1h to p2h + adds rx, rx, cy + cinc r8, r8, cs + adds rx, rx, r12 + cinc cy, r8, cs + b L(top) + +L(p2h): ldp r8, r9, [ap], #2*8 + ldr r17, [rp] + mul r12, r8, b0 + umulh r8, r8, b0 + mul r13, r9, b0 + umulh r9, r9, b0 + add jmp, jmp, #15*4 C jmp goes from p2h to p3h + adds rx, rx, cy + adcs r8, r17, r8 + cinc r9, r9, cs + adds rx, rx, r12 + adcs r8, r8, r13 + cinc cy, r9, cs + str r8, [rp], #1*8 + b L(top) + +L(p3h): ldp r8, r9, [ap], #2*8 + ldr r10, [ap], #1*8 + ldp r17, r18, [rp] + mul r12, r8, b0 + umulh r8, r8, b0 + mul r13, r9, b0 + umulh r9, r9, b0 + mul r14, r10, b0 + umulh r10, r10, b0 + add jmp, jmp, #20*4 C jmp goes from p3h to p0h + adds rx, rx, cy + adcs r8, r17, r8 + adcs r9, r18, r9 + cinc r10, r10, cs + adds rx, rx, r12 + adcs r8, r8, r13 + adcs r9, r9, r14 + cinc cy, r10, cs + stp r8, r9, [rp], #2*8 + b L(top) + +L(p0h): ldp r8, r9, [ap], #2*8 + ldp r10, r11, [ap], #2*8 + ldp r17, r18, [rp] + ldr r19, [rp, #2*8] + add ixsave, ixsave, #1 + sub jmp, jmp, #44*4 C jmp goes from p0h to p1h + mul r12, r8, b0 + umulh r8, r8, b0 + mul r13, r9, b0 + umulh r9, r9, b0 + mul r14, r10, b0 + umulh r10, r10, b0 + mul r15, r11, b0 + umulh r11, r11, b0 + adds rx, rx, cy + adcs r8, r17, r8 + adcs r9, r18, r9 + adcs r10, r19, r10 + cinc r11, r11, cs + adds rx, rx, r12 + adcs r8, r8, r13 + adcs r9, r9, r14 + adcs r10, r10, r15 + cinc cy, r11, cs + stp r8, r9, [rp], #2*8 + str r10, [rp], #1*8 + +L(top): ldp r8, r9, [ap], #2*8 + ldp r10, r11, [ap], #2*8 + ldp r16, r17, [rp] + ldp r18, r19, [rp, #2*8] + mul r12, r8, b0 + umulh r8, r8, b0 + mul r13, r9, b0 + umulh r9, r9, b0 + mul r14, r10, b0 + umulh r10, r10, b0 + mul r15, r11, b0 + umulh r11, r11, b0 + adds r12, r16, r12 + adcs r8, r17, r8 + adcs r9, r18, r9 + adcs r10, r19, r10 + cinc r11, r11, cs + adds r12, r12, cy + adcs r8, r8, r13 + adcs r9, r9, r14 + adcs r10, r10, r15 + cinc cy, r11, cs + stp r12, r8, [rp], #2*8 + stp r9, r10, [rp], #2*8 + sub ix, ix, #1 + cbnz ix, L(top) + +L(end): str cy, [rp] + + sub n, n, #1 + cbz n, L(fin) + + sub rp, rp, ns C Reset rp + sub ap, ap, ns C Reset ap, part 1 + add ns, ns, #1*8 C Increase reseter + + ldr cy, [ap, #-2*8]! C Reset ap, part 2 + umulh cy, cy, b0 + + mov ix, ixsave + ldr b0, [bp, #1*8]! + br jmp + +L(fin): ldp r16, r17, [sp, #2*8] + ldp r18, r19, [sp, #4*8] + ldp r14, r15, [sp], #10*8 + + mov res, rx + + ret +EPILOGUE() diff --git a/src/mpn_extras/arm64/mulhigh_hard.asm b/src/mpn_extras/arm64/mulhigh_hard.asm new file mode 100644 index 0000000000..0f5ac250c4 --- /dev/null +++ b/src/mpn_extras/arm64/mulhigh_hard.asm @@ -0,0 +1,793 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +define(`rp', `x0') +define(`ap', `x1') +define(`bp', `x2') + +define(`r0', `x3') +define(`r1', `x4') +define(`r2', `x5') +define(`r3', `x6') +define(`r4', `x7') +define(`r5', `x8') +define(`r6', `x9') +define(`r7', `x10') +define(`r8', `x11') +define(`r9', `x12') +define(`r10', `x13') +define(`r11', `x14') +define(`r12', `x15') +define(`r13', `x16') +define(`r14', `x17') + +define(`r15', `ap') dnl Beware! +define(`r16', `bp') + +dnl NOTE: The following has to be pushed and popped +dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times! +define(`r17', `x19') +define(`r18', `x20') +define(`r19', `x21') +define(`r20', `x22') +define(`r21', `x23') +define(`r22', `x24') +define(`r23', `x25') +define(`r24', `x26') +define(`r25', `x27') +define(`r26', `x28') +define(`r27', `x30') + +define(`res', `rp') dnl NOTE: Synonymous with rp! + +PROLOGUE(flint_mpn_mulhigh_1) + ldr r0, [ap] + ldr r1, [bp] + + umulh r3, r0, r1 + mul r2, r0, r1 + + str r3, [rp] + mov res, r2 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_mulhigh_2) + ldp r0, r1, [ap] + ldp r2, r3, [bp] + + umulh r4, r0, r2 + + mul r5, r0, r3 + umulh r6, r0, r3 + + mul r7, r1, r2 + umulh r8, r1, r2 + + mul r9, r1, r3 + umulh r10, r1, r3 + C (4, 5, 7), (6, 8, 9), 10 + + adds r4, r4, r5 + adcs r6, r6, r8 + cinc r10, r10, cs + adds r4, r4, r7 + adcs r6, r6, r9 + cinc r10, r10, cs + + stp r6, r10, [rp] + mov res, r4 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_mulhigh_3) + ldp r0, r1, [ap] + ldp r3, r4, [bp] + ldr r2, [ap, #2*8] + ldr r5, [bp, #2*8] + + umulh r6, r1, r3 + mul r7, r2, r3 + umulh r3, r2, r3 + umulh r8, r1, r4 + umulh r9, r2, r4 + umulh r10, r0, r4 + mul r11, r2, r4 + mul r12, r0, r5 + + adds r6, r6, r7 + + umulh r0, r0, r5 + umulh r13, r1, r5 + + adcs r3, r3, r8 + + umulh r14, r2, r5 + mul r4, r1, r4 + + cinc r9, r9, cs + adds r6, r6, r10 + + mul r1, r1, r5 + mul r2, r2, r5 + + adcs r3, r3, r11 + cinc r9, r9, cs + + adds r6, r6, r12 + adcs r3, r3, r0 + adcs r9, r9, r13 + cinc r14, r14, cs + + adds r6, r6, r4 + adcs r3, r3, r1 + adcs r9, r9, r2 + cinc r14, r14, cs + + stp r3, r9, [rp] + str r14, [rp, #2*8] + mov res, r6 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_mulhigh_4) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [bp] + ldp r6, r7, [bp, #2*8] + + umulh r8, r2, r4 + mul r9, r3, r4 + umulh r4, r3, r4 + umulh r12, r2, r5 + umulh r14, r3, r5 + umulh r10, r1, r5 + mul r11, r2, r5 + mul r13, r3, r5 + C (8, 9, 10, 11), (4, 12, 13), 14 + C Free: 5, 15, 16 + + adds r8, r8, r9 + + umulh r9, r0, r6 + umulh r5, r1, r6 + + adcs r4, r4, r12 + + umulh r12, r2, r6 + umulh r15, r3, r6 + + cinc r14, r14, cs + adds r8, r8, r10 + + mul r16, r1, r6 + mul r10, r2, r6 + + adcs r4, r4, r13 + cinc r14, r14, cs + adds r8, r8, r11 + + mul r13, r3, r6 + umulh r11, r0, r7 + + adcs r4, r4, r5 + adcs r14, r14, r12 + + umulh r5, r1, r7 + umulh r12, r2, r7 + + cinc r15, r15, cs + adds r8, r8, r9 + adcs r4, r4, r10 + + umulh r9, r3, r7 + mul r10, r0, r7 + + adcs r14, r14, r13 + cinc r15, r15, cs + adds r8, r8, r16 + + mul r13, r1, r7 + mul r16, r2, r7 + + adcs r4, r4, r11 + adcs r14, r14, r5 + + mul r11, r3, r7 + + adcs r15, r15, r12 + cinc r9, r9, cs + C (8, 10), (4, 13), (14, 16), (15, 11), 9 + + adds r8, r8, r10 + adcs r4, r4, r13 + adcs r14, r14, r16 + adcs r15, r15, r11 + cinc r9, r9, cs + + stp r4, r14, [rp] + stp r15, r9, [rp, #2*8] + mov res, r8 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_mulhigh_5) + ldp r2, r3, [ap, #2*8] + ldr r4, [ap, #4*8] + ldp r5, r6, [bp] + ldp r0, r1, [ap] + ldp r7, r8, [bp, #2*8] + ldr r9, [bp, #4*8] + + umulh r10, r3, r5 + mul r11, r4, r5 + umulh r5, r4, r5 + umulh r12, r3, r6 + umulh r13, r4, r6 + umulh r14, r2, r6 + mul r15, r3, r6 + mul r6, r4, r6 + C (10, 11, 14, 15), (5, 12, 6), 13 + C Free: 16 + + adds r10, r10, r11 + umulh r16, r1, r7 + umulh r11, r2, r7 + adcs r5, r5, r12 + umulh r12, r3, r7 + cinc r13, r13, cs + adds r10, r10, r14 + umulh r14, r4, r7 + adcs r5, r5, r6 + mul r6, r2, r7 + cinc r13, r13, cs + adds r10, r10, r15 + mul r15, r3, r7 + adcs r5, r5, r11 + mul r7, r4, r7 + umulh r11, r0, r8 + adcs r13, r13, r12 + umulh r12, r1, r8 + cinc r14, r14, cs + adds r10, r10, r16 + umulh r16, r2, r8 + adcs r5, r5, r15 + umulh r15, r3, r8 + adcs r13, r13, r7 + umulh r7, r4, r8 + cinc r14, r14, cs + adds r10, r10, r6 + mul r6, r1, r8 + adcs r5, r5, r12 + mul r12, r2, r8 + adcs r13, r13, r16 + mul r16, r3, r8 + adcs r14, r14, r15 + mul r8, r4, r8 + umulh r15, r0, r9 + cinc r7, r7, cs + adds r10, r10, r11 + umulh r11, r1, r9 + adcs r5, r5, r12 + umulh r12, r2, r9 + adcs r13, r13, r16 + umulh r16, r3, r9 + adcs r14, r14, r8 + umulh r8, r4, r9 + mul r0, r0, r9 + cinc r7, r7, cs + mul r1, r1, r9 + mul r2, r2, r9 + mul r3, r3, r9 + mul r4, r4, r9 + + adds r10, r10, r6 + adcs r5, r5, r15 + adcs r13, r13, r11 + adcs r14, r14, r12 + adcs r7, r7, r16 + cinc r8, r8, cs + adds r10, r10, r0 + adcs r5, r5, r1 + adcs r13, r13, r2 + adcs r14, r14, r3 + stp r5, r13, [rp] + adcs r7, r7, r4 + cinc r8, r8, cs + stp r14, r7, [rp, #2*8] + str r8, [rp, #4*8] + mov res, r10 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_mulhigh_6) + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + ldp r6, r7, [bp] + + ldp r0, r1, [ap] + ldp r8, r9, [bp, #2*8] + ldp r10, r11, [bp, #4*8] + + stp r17, r18, [sp, #-2*8]! + + umulh r12, r4, r6 + mul r13, r5, r6 + umulh r6, r5, r6 + umulh r14, r4, r7 + umulh r15, r5, r7 + umulh r16, r3, r7 + mul r17, r4, r7 + mul r7, r5, r7 + C (12, 13, 16, 17), (6, 14, 7), 15 + C Free: 18 + + umulh r18, r2, r8 + adds r12, r12, r13 + umulh r13, r3, r8 + adcs r6, r6, r14 + umulh r14, r4, r8 + cinc r15, r15, cs + adds r12, r12, r16 + umulh r16, r5, r8 + adcs r6, r6, r7 + mul r7, r3, r8 + cinc r15, r15, cs + adds r12, r12, r17 + mul r17, r4, r8 + mul r8, r5, r8 + adcs r6, r6, r13 + umulh r13, r1, r9 + adcs r15, r15, r14 + umulh r14, r2, r9 + cinc r16, r16, cs + adds r12, r12, r18 + umulh r18, r3, r9 + adcs r6, r6, r17 + umulh r17, r4, r9 + adcs r15, r15, r8 + umulh r8, r5, r9 + cinc r16, r16, cs + adds r12, r12, r7 + mul r7, r2, r9 + adcs r6, r6, r14 + mul r14, r3, r9 + adcs r15, r15, r18 + mul r18, r4, r9 + mul r9, r5, r9 + adcs r16, r16, r17 + umulh r17, r0, r10 + cinc r8, r8, cs + adds r12, r12, r13 + umulh r13, r1, r10 + adcs r6, r6, r14 + umulh r14, r2, r10 + adcs r15, r15, r18 + umulh r18, r3, r10 + adcs r16, r16, r9 + umulh r9, r4, r10 + cinc r8, r8, cs + adds r12, r12, r7 + umulh r7, r5, r10 + adcs r6, r6, r13 + mul r13, r1, r10 + adcs r15, r15, r14 + mul r14, r2, r10 + adcs r16, r16, r18 + mul r18, r3, r10 + adcs r8, r8, r9 + mul r9, r4, r10 + mul r10, r5, r10 + cinc r7, r7, cs + adds r12, r12, r17 + umulh r17, r0, r11 + adcs r6, r6, r14 + umulh r14, r1, r11 + adcs r15, r15, r18 + umulh r18, r2, r11 + adcs r16, r16, r9 + umulh r9, r3, r11 + adcs r8, r8, r10 + umulh r10, r4, r11 + cinc r7, r7, cs + adds r12, r12, r13 + umulh r13, r5, r11 + mul r0, r0, r11 + adcs r6, r6, r17 + mul r1, r1, r11 + mul r2, r2, r11 + adcs r15, r15, r14 + mul r3, r3, r11 + mul r4, r4, r11 + adcs r16, r16, r18 + mul r5, r5, r11 + adcs r8, r8, r9 + adcs r7, r7, r10 + cinc r13, r13, cs + ldp r17, r18, [sp], #2*8 + adds r12, r12, r0 + adcs r6, r6, r1 + adcs r15, r15, r2 + adcs r16, r16, r3 + stp r6, r15, [rp] + adcs r8, r8, r4 + adcs r7, r7, r5 + stp r16, r8, [rp, #2*8] + cinc r13, r13, cs + stp r7, r13, [rp, #4*8] + + mov res, r12 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_mulhigh_7) + ldp r4, r5, [ap, #4*8] + ldr r6, [ap, #6*8] + ldp r7, r8, [bp] + stp r17, r18, [sp, #-4*8]! + + ldp r9, r10, [bp, #2*8] + ldp r11, r12, [bp, #4*8] + ldr r13, [bp, #6*8] + stp r19, r20, [sp, #2*8] + + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + + umulh r14, r5, r7 + mul r15, r6, r7 + umulh r7, r6, r7 + umulh r16, r5, r8 + umulh r17, r6, r8 + umulh r18, r4, r8 + mul r19, r5, r8 + mul r8, r6, r8 + C (14, 15, 18, 19), (7, 16, 8), 17 + C Free: 20 + + umulh r20, r3, r9 + adds r14, r14, r15 + umulh r15, r4, r9 + adcs r7, r7, r16 + umulh r16, r5, r9 + cinc r17, r17, cs + adds r14, r14, r18 + umulh r18, r6, r9 + adcs r7, r7, r8 + mul r8, r4, r9 + cinc r17, r17, cs + adds r14, r14, r19 + mul r19, r5, r9 + adcs r7, r7, r15 + mul r9, r6, r9 + + umulh r15, r2, r10 + adcs r17, r17, r16 + umulh r16, r3, r10 + cinc r18, r18, cs + adds r14, r14, r20 + umulh r20, r4, r10 + adcs r7, r7, r19 + umulh r19, r5, r10 + adcs r17, r17, r9 + umulh r9, r6, r10 + cinc r18, r18, cs + adds r14, r14, r8 + mul r8, r3, r10 + adcs r7, r7, r16 + mul r16, r4, r10 + adcs r17, r17, r20 + mul r20, r5, r10 + adcs r18, r18, r19 + mul r10, r6, r10 + + umulh r19, r1, r11 + cinc r9, r9, cs + adds r14, r14, r15 + umulh r15, r2, r11 + adcs r7, r7, r16 + umulh r16, r3, r11 + adcs r17, r17, r20 + umulh r20, r4, r11 + adcs r18, r18, r10 + umulh r10, r5, r11 + cinc r9, r9, cs + adds r14, r14, r8 + umulh r8, r6, r11 + adcs r7, r7, r15 + mul r15, r2, r11 + adcs r17, r17, r16 + mul r16, r3, r11 + adcs r18, r18, r20 + mul r20, r4, r11 + adcs r9, r9, r10 + mul r10, r5, r11 + mul r11, r6, r11 + + cinc r8, r8, cs + adds r14, r14, r19 + umulh r19, r0, r12 + adcs r7, r7, r16 + umulh r16, r1, r12 + adcs r17, r17, r20 + umulh r20, r2, r12 + adcs r18, r18, r10 + umulh r10, r3, r12 + adcs r9, r9, r11 + umulh r11, r4, r12 + cinc r8, r8, cs + adds r14, r14, r15 + umulh r15, r5, r12 + adcs r7, r7, r16 + umulh r16, r6, r12 + adcs r17, r17, r20 + mul r20, r1, r12 + adcs r18, r18, r10 + mul r10, r2, r12 + adcs r9, r9, r11 + mul r11, r3, r12 + adcs r8, r8, r15 + mul r15, r4, r12 + cinc r16, r16, cs + adds r14, r14, r19 + mul r19, r5, r12 + mul r12, r6, r12 + + adcs r7, r7, r10 + umulh r10, r0, r13 + adcs r17, r17, r11 + umulh r11, r1, r13 + adcs r18, r18, r15 + umulh r15, r2, r13 + adcs r9, r9, r19 + umulh r19, r3, r13 + adcs r8, r8, r12 + umulh r12, r4, r13 + cinc r16, r16, cs + adds r14, r14, r20 + umulh r20, r5, r13 + adcs r7, r7, r10 + umulh r10, r6, r13 + mul r0, r0, r13 + adcs r17, r17, r11 + mul r1, r1, r13 + mul r2, r2, r13 + adcs r18, r18, r15 + mul r3, r3, r13 + mul r4, r4, r13 + adcs r9, r9, r19 + mul r5, r5, r13 + mul r6, r6, r13 + adcs r8, r8, r12 + adcs r16, r16, r20 + ldp r19, r20, [sp, #2*8] + cinc r10, r10, cs + adds r14, r14, r0 + adcs r7, r7, r1 + adcs r11, r17, r2 + adcs r15, r18, r3 + adcs r9, r9, r4 + stp r7, r11, [rp] + ldp r17, r18, [sp], #4*8 + adcs r8, r8, r5 + stp r15, r9, [rp, #2*8] + adcs r16, r16, r6 + cinc r10, r10, cs + stp r8, r16, [rp, #4*8] + str r10, [rp, #6*8] + mov res, r14 + ret +EPILOGUE() + +PROLOGUE(flint_mpn_mulhigh_8) + ldp r4, r5, [ap, #4*8] + ldp r6, r7, [ap, #6*8] + ldp r8, r9, [bp] + stp r17, r18, [sp, #-6*8]! + + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r10, r11, [bp, #2*8] + stp r19, r20, [sp, #2*8] + + ldp r12, r13, [bp, #4*8] + ldp r14, r15, [bp, #6*8] + stp r21, r22, [sp, #4*8] + + umulh r16, r6, r8 + mul r17, r7, r8 + umulh r8, r7, r8 + umulh r18, r6, r9 + umulh r19, r7, r9 + umulh r20, r5, r9 + mul r21, r6, r9 + mul r9, r7, r9 + C (16, 17, 20, 21), (8, 18, 9), 19 + C Free: 22 + + umulh r22, r4, r10 + adds r16, r16, r17 + umulh r17, r5, r10 + adcs r8, r8, r18 + umulh r18, r6, r10 + cinc r19, r19, cs + adds r16, r16, r20 + umulh r20, r7, r10 + adcs r8, r8, r9 + mul r9, r5, r10 + cinc r19, r19, cs + adds r16, r16, r21 + mul r21, r6, r10 + mul r10, r7, r10 + + adcs r8, r8, r17 + umulh r17, r3, r11 + adcs r19, r19, r18 + umulh r18, r4, r11 + cinc r20, r20, cs + adds r16, r16, r22 + umulh r22, r5, r11 + adcs r8, r8, r21 + umulh r21, r6, r11 + adcs r19, r19, r10 + umulh r10, r7, r11 + cinc r20, r20, cs + adds r16, r16, r9 + mul r9, r4, r11 + adcs r8, r8, r18 + mul r18, r5, r11 + adcs r19, r19, r22 + mul r22, r6, r11 + mul r11, r7, r11 + + adcs r20, r20, r21 + umulh r21, r2, r12 + cinc r10, r10, cs + adds r16, r16, r17 + umulh r17, r3, r12 + adcs r8, r8, r18 + umulh r18, r4, r12 + adcs r19, r19, r22 + umulh r22, r5, r12 + adcs r20, r20, r11 + umulh r11, r6, r12 + cinc r10, r10, cs + adds r16, r16, r9 + umulh r9, r7, r12 + adcs r8, r8, r17 + mul r17, r3, r12 + adcs r19, r19, r18 + mul r18, r4, r12 + adcs r20, r20, r22 + mul r22, r5, r12 + adcs r10, r10, r11 + mul r11, r6, r12 + mul r12, r7, r12 + + cinc r9, r9, cs + adds r16, r16, r21 + umulh r21, r1, r13 + adcs r8, r8, r18 + umulh r18, r2, r13 + adcs r19, r19, r22 + umulh r22, r3, r13 + adcs r20, r20, r11 + umulh r11, r4, r13 + adcs r10, r10, r12 + umulh r12, r5, r13 + cinc r9, r9, cs + adds r16, r16, r17 + umulh r17, r6, r13 + adcs r8, r8, r18 + umulh r18, r7, r13 + adcs r19, r19, r22 + mul r22, r2, r13 + adcs r20, r20, r11 + mul r11, r3, r13 + adcs r10, r10, r12 + mul r12, r4, r13 + adcs r9, r9, r17 + mul r17, r5, r13 + cinc r18, r18, cs + adds r16, r16, r21 + mul r21, r6, r13 + mul r13, r7, r13 + + adcs r8, r8, r11 + umulh r11, r0, r14 + adcs r19, r19, r12 + umulh r12, r1, r14 + adcs r20, r20, r17 + umulh r17, r2, r14 + adcs r10, r10, r21 + umulh r21, r3, r14 + adcs r9, r9, r13 + umulh r13, r4, r14 + cinc r18, r18, cs + adds r16, r16, r22 + umulh r22, r5, r14 + adcs r8, r8, r12 + umulh r12, r6, r14 + adcs r19, r19, r17 + umulh r17, r7, r14 + adcs r20, r20, r21 + mul r21, r1, r14 + adcs r10, r10, r13 + mul r13, r2, r14 + adcs r9, r9, r22 + mul r22, r3, r14 + adcs r18, r18, r12 + mul r12, r4, r14 + cinc r17, r17, cs + adds r16, r16, r11 + mul r11, r5, r14 + adcs r8, r8, r13 + mul r13, r6, r14 + mul r14, r7, r14 + + adcs r19, r19, r22 + umulh r22, r0, r15 + adcs r20, r20, r12 + umulh r12, r1, r15 + adcs r10, r10, r11 + umulh r11, r2, r15 + adcs r9, r9, r13 + umulh r13, r3, r15 + adcs r18, r18, r14 + umulh r14, r4, r15 + cinc r17, r17, cs + adds r16, r16, r21 + umulh r21, r5, r15 + adcs r8, r8, r22 + umulh r22, r6, r15 + adcs r19, r19, r12 + umulh r12, r7, r15 + mul r0, r0, r15 + adcs r20, r20, r11 + mul r1, r1, r15 + adcs r10, r10, r13 + mul r2, r2, r15 + adcs r9, r9, r14 + mul r3, r3, r15 + adcs r18, r18, r21 + mul r4, r4, r15 + adcs r17, r17, r22 + mul r5, r5, r15 + cinc r12, r12, cs + ldp r21, r22, [sp, #4*8] + mul r6, r6, r15 + adds r16, r16, r0 + mul r7, r7, r15 + adcs r8, r8, r1 + adcs r0, r19, r2 + adcs r1, r20, r3 + stp r8, r0, [rp] + ldp r19, r20, [sp, #2*8] + adcs r10, r10, r4 + adcs r9, r9, r5 + stp r1, r10, [rp, #2*8] + adcs r6, r18, r6 + adcs r7, r17, r7 + ldp r17, r18, [sp], #6*8 + stp r9, r6, [rp, #4*8] + cinc r12, r12, cs + stp r7, r12, [rp, #6*8] + mov res, r16 + ret +EPILOGUE() diff --git a/src/mpn_extras/arm64/sqr_hard.asm b/src/mpn_extras/arm64/sqr_hard.asm new file mode 100644 index 0000000000..4fe56fbdc0 --- /dev/null +++ b/src/mpn_extras/arm64/sqr_hard.asm @@ -0,0 +1,1376 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +define(`rp', `x0') +define(`ap', `x1') + +define(`r0', `x2') +define(`r1', `x3') +define(`r2', `x4') +define(`r3', `x5') +define(`r4', `x6') +define(`r5', `x7') +define(`r6', `x8') +define(`r7', `x9') +define(`r8', `x10') +define(`r9', `x11') +define(`r10', `x12') +define(`r11', `x13') +define(`r12', `x14') +define(`r13', `x15') +define(`r14', `x16') +define(`r15', `x17') + +define(`zr', `xzr') + +define(`res', `rp') dnl NOTE: Synonymous with rp! + +define(`r16', `ap') dnl Beware! + +dnl NOTE: The following has to be pushed and popped +dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times! +define(`r17', `x19') +define(`r18', `x20') +define(`r19', `x21') +define(`r20', `x22') +define(`r21', `x23') +define(`r22', `x24') +define(`r23', `x25') +define(`r24', `x26') +define(`r25', `x27') +define(`r26', `x28') +define(`r27', `x30') + +PROLOGUE(flint_mpn_sqr_1) + ldr r0, [ap] + + mul r1, r0, r0 + umulh r2, r0, r0 + + stp r1, r2, [rp] + mov res, r2 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqr_2) + ldp r0, r1, [ap] + + mul r2, r0, r1 + umulh r3, r0, r1 + C 2, 3 + + mul r4, r0, r0 + umulh r0, r0, r0 + mul r5, r1, r1 + umulh r1, r1, r1 + C 4, 0, 5, 1 + + adds r2, r2, r2 + adcs r3, r3, r3 + cset r6, cs + C 2, 3, 6 + + adds r0, r0, r2 + adcs r5, r5, r3 + adc r1, r1, r6 + C 4, 0, 5, 1 + + stp r4, r0, [rp] + stp r5, r1, [rp, #2*8] + mov res, r1 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqr_3) + ldp r0, r1, [ap] + ldr r2, [ap, #2*8] + + mul r3, r0, r1 + umulh r4, r0, r1 + mul r5, r0, r2 + umulh r6, r0, r2 + mul r7, r1, r2 + umulh r8, r1, r2 + C 3, (4, 5), (6, 7), 8 + + mul r9, r0, r0 + umulh r0, r0, r0 + mul r10, r1, r1 + umulh r1, r1, r1 + + adds r4, r4, r5 + adcs r6, r6, r7 + cinc r8, r8, cs + C 3, 4, 6, 8 + + mul r11, r2, r2 + umulh r2, r2, r2 + C 9, 0, 10, 1, 11, 2 + + adds r3, r3, r3 + adcs r4, r4, r4 + adcs r6, r6, r6 + adcs r8, r8, r8 + cset r12, cs + C 3, 4, 6, 8, 12 + + C 3, 4, 6, 8, 12 + C 9, 0, 10, 1, 11, 2 + adds r0, r0, r3 + + adcs r10, r10, r4 + adcs r1, r1, r6 + adcs r11, r11, r8 + + stp r9, r0, [rp] + + adc r2, r2, r12 + + stp r10, r1, [rp, #2*8] + stp r11, r2, [rp, #4*8] + mov res, r2 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqr_4) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + + mul r4, r0, r1 + umulh r5, r0, r1 + mul r6, r0, r2 + umulh r7, r0, r2 + mul r8, r0, r3 + umulh r9, r0, r3 + umulh r10, r1, r2 + umulh r11, r1, r3 + C 4, (5, 6), (7, 8), (9, 10), 11 + + mul r12, r1, r2 + mul r13, r1, r3 + + adds r5, r5, r6 + + mul r14, r2, r3 + umulh r15, r2, r3 + + adcs r7, r7, r8 + + mul r6, r0, r0 + umulh r0, r0, r0 + + adcs r9, r9, r10 + + mul r8, r1, r1 + umulh r1, r1, r1 + + cinc r11, r11, cs + + adds r7, r7, r12 + adcs r9, r9, r13 + + mul r10, r2, r2 + umulh r2, r2, r2 + + adcs r11, r11, r14 + cinc r15, r15, cs + C 4, 5, 7, 9, 11, 15 + + adds r4, r4, r4 + + mul r12, r3, r3 + umulh r3, r3, r3 + C 6, 0, 8, 1, 10, 2, 12, 3 + + adcs r5, r5, r5 + adcs r7, r7, r7 + adcs r9, r9, r9 + adcs r11, r11, r11 + adcs r15, r15, r15 + cset r13, cs + C 4, 5, 7, 9, 11, 15, 13 + + C 4, 5, 7, 9, 11, 15, 13 + C 6, 0, 8, 1, 10, 2, 12, 3 + adds r0, r0, r4 + adcs r8, r8, r5 + adcs r1, r1, r7 + stp r6, r0, [rp] + adcs r10, r10, r9 + adcs r2, r2, r11 + adcs r12, r12, r15 + stp r8, r1, [rp, #2*8] + adc r3, r3, r13 + stp r10, r2, [rp, #4*8] + stp r12, r3, [rp, #6*8] + mov res, r3 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqr_5) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldr r4, [ap, #4*8] + + stp r17, r18, [sp, #-2*8]! + + mul r5, r0, r1 + umulh r6, r0, r1 + mul r7, r0, r2 + umulh r8, r0, r2 + mul r9, r0, r3 + umulh r10, r0, r3 + mul r11, r0, r4 + umulh r12, r0, r4 + umulh r13, r1, r3 + umulh r14, r1, r4 + + adds r6, r6, r7 + + mul r15, r1, r2 + umulh r16, r1, r2 + + adcs r8, r8, r9 + + mul r17, r1, r4 + umulh r18, r2, r3 + + adcs r10, r10, r11 + + mul r7, r1, r3 + mul r9, r2, r3 + + adcs r12, r12, r13 + cinc r14, r14, cs + + mul r11, r2, r4 + umulh r13, r2, r4 + + adds r8, r8, r15 + adcs r10, r10, r16 + + mul r15, r3, r4 + umulh r16, r3, r4 + + adcs r12, r12, r17 + adcs r14, r14, r18 + cinc r13, r13, cs + C 5, 6, 8, (10, 7), (12, 9), (14, 11), (13, 15), 16 + C 17, 18 + + mul r17, r0, r0 + umulh r0, r0, r0 + + adds r10, r10, r7 + adcs r12, r12, r9 + + mul r18, r1, r1 + umulh r1, r1, r1 + + adcs r14, r14, r11 + + mul r7, r2, r2 + umulh r2, r2, r2 + + adcs r13, r13, r15 + cinc r16, r16, cs + C 5, 6, 8, 10, 12, 14, 13, 16 + C 17, 0, 18, 1, 7, 2 + C Free: 9, 11, 15 + + adds r5, r5, r5 + + mul r9, r3, r3 + umulh r3, r3, r3 + + adcs r6, r6, r6 + adcs r8, r8, r8 + adcs r10, r10, r10 + + mul r11, r4, r4 + umulh r4, r4, r4 + + adcs r12, r12, r12 + adcs r14, r14, r14 + adcs r13, r13, r13 + adcs r16, r16, r16 + cset r15, cs + C 5, 6, 8, 10, 12, 14, 13, 16, 15 + C 17, 0, 18, 1, 7, 2, 9, 3, 11, 4 + + adds r0, r0, r5 + adcs r6, r6, r18 + adcs r1, r1, r8 + adcs r7, r7, r10 + stp r17, r0, [rp] + adcs r2, r2, r12 + adcs r9, r9, r14 + adcs r3, r3, r13 + stp r6, r1, [rp, #2*8] + adcs r11, r11, r16 + adc r4, r4, r15 + stp r7, r2, [rp, #4*8] + ldp r17, r18, [sp], #2*8 + stp r9, r3, [rp, #6*8] + stp r11, r4, [rp, #8*8] + + mov res, r4 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqr_6) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + + stp r17, r18, [sp, #-4*8]! + stp r19, r20, [sp, #2*8] + + C mul rX, r0, r1 First computed when doubling + umulh r6, r0, r1 + mul r7, r0, r2 + umulh r8, r0, r2 + mul r9, r0, r3 + umulh r10, r0, r3 + mul r11, r0, r4 + umulh r12, r0, r4 + mul r13, r0, r5 + + adds r6, r6, r7 + + umulh r14, r0, r5 + C 6, (8 + cs, 9), (10, 11), (12, 13), 14 + + mul r15, r1, r2 + + adcs r8, r8, r9 + + mul r16, r1, r3 + mul r17, r1, r4 + + adcs r10, r10, r11 + + mul r18, r1, r5 + umulh r19, r1, r5 + + adcs r12, r12, r13 + + umulh r20, r1, r2 + umulh r7, r1, r3 + + cinc r14, r14, cs + + umulh r9, r1, r4 + C 6, (8, 15), (10, 16, 20), (12, 17, 7), (14, 18, 9), 19 + + umulh r11, r2, r4 + + adds r8, r8, r15 + adcs r10, r10, r16 + adcs r12, r12, r17 + + umulh r13, r2, r5 + mul r15, r2, r3 + + adcs r14, r14, r18 + cinc r19, r19, cs + + umulh r16, r2, r3 + mul r17, r2, r5 + + adds r10, r10, r20 + adcs r12, r12, r7 + C 6, 8, 10, (12, 15), (14 + cs, 9, 16), (19, 11, 17), 13 + + umulh r18, r3, r4 + umulh r20, r3, r5 + + adcs r14, r14, r9 + adcs r19, r19, r11 + + mul r7, r2, r4 + mul r9, r3, r4 + + cinc r13, r13, cs + adds r12, r12, r15 + + mul r11, r3, r5 + mul r15, r4, r5 + + adcs r14, r14, r16 + adcs r19, r19, r17 + + umulh r16, r4, r5 + mul r17, r0, r1 + + adcs r13, r13, r18 + cinc r20, r20, cs + C 17, 6, 8, 10, 12, (14, 7), (19, 9), (13, 11), (20, 15), 16 + C Free: 18 + + mul r18, r0, r0 + umulh r0, r0, r0 + + adds r14, r14, r7 + adcs r19, r19, r9 + + mul r7, r1, r1 + umulh r1, r1, r1 + + adcs r13, r13, r11 + adcs r20, r20, r15 + + mul r9, r2, r2 + umulh r2, r2, r2 + + cinc r16, r16, cs + C 17, 6, 8, 10, 12, 14, 19, 13, 20, 16 + C 18, 0, 7, 1, 9, 2 + C Free: 11, 15 + + adds r17, r17, r17 + adcs r6, r6, r6 + + mul r11, r3, r3 + umulh r3, r3, r3 + + adcs r8, r8, r8 + adcs r10, r10, r10 + adcs r12, r12, r12 + adcs r14, r14, r14 + lsr r15, r16, #63 + adcs r19, r19, r19 + adcs r13, r13, r13 + adcs r20, r20, r20 + adc r16, r16, r16 + C 17, 6, 8, 10, 12, 14, 19, 13, 20, 16, 15 + C 18, 0, 7, 1, 9, 2, 11, 3 + C Free: + + adds r0, r0, r17 + + mul r17, r4, r4 + umulh r4, r4, r4 + + adcs r7, r7, r6 + adcs r1, r1, r8 + adcs r9, r9, r10 + + stp r18, r0, [rp] + + mul r6, r5, r5 + umulh r5, r5, r5 + + adcs r2, r2, r12 + adcs r11, r11, r14 + adcs r3, r3, r19 + + stp r7, r1, [rp, #2*8] + + adcs r13, r13, r17 + adcs r4, r4, r20 + + stp r9, r2, [rp, #4*8] + + ldp r19, r20, [sp, #2*8] + + stp r11, r3, [rp, #6*8] + + adcs r6, r6, r16 + adcs r5, r5, r15 + + stp r13, r4, [rp, #8*8] + + ldp r17, r18, [sp], #4*8 + + stp r6, r5, [rp, #10*8] + + mov res, r5 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqr_7) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + stp r17, r18, [sp, #-6*8]! + + ldr r6, [ap, #6*8] + stp r19, r20, [sp, #2*8] + stp r21, r22, [sp, #4*8] + + C mul rX, r0, r1 First computed when doubling + umulh r7, r0, r1 + mul r8, r0, r2 + umulh r9, r0, r2 + mul r10, r0, r3 + umulh r11, r0, r3 + mul r12, r0, r4 + umulh r13, r0, r4 + mul r14, r0, r5 + + adds r7, r7, r8 + + umulh r15, r0, r5 + mul r16, r0, r6 + + adcs r9, r9, r10 + + umulh r17, r0, r6 + C 7, 9, (11 + cs, 12), (13, 14), (15, 16), 17 + + mul r18, r1, r2 + + adcs r11, r11, r12 + + mul r19, r1, r3 + mul r20, r1, r4 + + adcs r13, r13, r14 + + mul r21, r1, r5 + mul r22, r1, r6 + + adcs r15, r15, r16 + + umulh r8, r1, r6 + umulh r10, r1, r2 + + cinc r17, r17, cs + + umulh r12, r1, r3 + umulh r14, r1, r4 + + adds r9, r9, r18 + + umulh r16, r1, r5 + mul r18, r2, r3 + C 7, 9, (11 + cs, 19, 10), (13, 20, 12, 18), (15, 21, 14), (17, 22, 16), 8 + C Free: + + adcs r11, r11, r19 + adcs r13, r13, r20 + + mul r19, r2, r4 + mul r20, r2, r5 + + adcs r15, r15, r21 + adcs r17, r17, r22 + + mul r21, r2, r6 + umulh r22, r2, r6 + + cinc r8, r8, cs + + adds r11, r11, r10 + adcs r13, r13, r12 + + umulh r10, r2, r3 + umulh r12, r2, r4 + + adcs r15, r15, r14 + adcs r17, r17, r16 + + umulh r14, r2, r5 + umulh r16, r3, r5 + + cinc r8, r8, cs + + adds r13, r13, r18 + adcs r15, r15, r19 + C 7, 9, 11, 13, (15, 10), (17 + cs, 20, 12), (8, 21, 14), (22, 16) + + umulh r18, r3, r6 + mul r19, r3, r4 + + adcs r17, r17, r20 + adcs r8, r8, r21 + + umulh r20, r3, r4 + mul r21, r3, r6 + + cinc r22, r22, cs + + adds r15, r15, r10 + adcs r17, r17, r12 + + umulh r10, r4, r5 + umulh r12, r4, r6 + + adcs r8, r8, r14 + adcs r22, r22, r16 + + mul r14, r3, r5 + mul r16, r4, r5 + + cinc r18, r18, cs + + adds r17, r17, r19 + adcs r8, r8, r20 + + mul r19, r4, r6 + mul r20, r5, r6 + + adcs r22, r22, r21 + adcs r18, r18, r10 + + umulh r21, r5, r6 + mul r10, r0, r1 + + cinc r12, r12, cs + C 10, 7, 9, 11, 13, 15, 17, (8, 14), (22, 16), (18, 19), (12, 20), 21 + + adds r8, r8, r14 + adcs r22, r22, r16 + + mul r14, r0, r0 + umulh r0, r0, r0 + + adcs r18, r18, r19 + adcs r12, r12, r20 + cinc r21, r21, cs + + mul r16, r1, r1 + umulh r1, r1, r1 + C 10, 7, 9, 11, 13, 15, 17, 8, 22, 18, 12, 21 + C 14, 0, 16, 1 + C Free: 19, 20 + + adds r10, r10, r10 + adcs r7, r7, r7 + adcs r9, r9, r9 + adcs r11, r11, r11 + adcs r13, r13, r13 + + mul r20, r2, r2 + umulh r2, r2, r2 + + adcs r15, r15, r15 + adcs r17, r17, r17 + adcs r8, r8, r8 + lsr r19, r21, #63 + adcs r22, r22, r22 + adcs r18, r18, r18 + adcs r12, r12, r12 + adc r21, r21, r21 + C 10, 7, 9, 11, 13, 15, 17, 8, 22, 18, 12, 21, 19 + C 14, 0, 16, 1, 20, 2 + C Free: + + adds r0, r0, r10 + + mul r10, r3, r3 + umulh r3, r3, r3 + + adcs r16, r16, r7 + adcs r1, r1, r9 + + stp r14, r0, [rp] + + mul r7, r4, r4 + umulh r4, r4, r4 + + adcs r20, r20, r11 + adcs r2, r2, r13 + + stp r16, r1, [rp, #2*8] + + mul r9, r5, r5 + umulh r5, r5, r5 + + adcs r10, r10, r15 + adcs r3, r3, r17 + + stp r20, r2, [rp, #4*8] + + mul r14, r6, r6 + umulh r6, r6, r6 + + adcs r7, r7, r8 + adcs r4, r4, r22 + + stp r10, r3, [rp, #6*8] + + adcs r9, r9, r18 + adcs r5, r5, r12 + adcs r14, r14, r21 + adcs r6, r6, r19 + + stp r7, r4, [rp, #8*8] + ldp r19, r20, [sp, #2*8] + stp r9, r5, [rp, #10*8] + ldp r21, r22, [sp, #4*8] + stp r14, r6, [rp, #12*8] + ldp r17, r18, [sp], #6*8 + + mov res, r6 + + ret +EPILOGUE() + +dnl From a{n - 4} forward, use the following sequence: +dnl +dnl umulh rX, r2, r4 +dnl umulh rX, r2, r5 +dnl mul rX, r2, r3 +dnl umulh rX, r2, r3 +dnl mul rX, r2, r5 +dnl umulh rX, r3, r4 +dnl umulh rX, r3, r5 +dnl mul rX, r2, r4 +dnl mul rX, r3, r4 +dnl mul rX, r3, r5 +dnl mul rX, r4, r5 +dnl umulh rX, r4, r5 +dnl mul rX, r0, r1 + +PROLOGUE(flint_mpn_sqr_8) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + stp r17, r18, [sp, #-10*8]! + stp r19, r20, [sp, #2*8] + ldp r6, r7, [ap, #6*8] + stp r21, r22, [sp, #4*8] + stp r23, r24, [sp, #6*8] + stp r25, r26, [sp, #8*8] + + C mul rX, r0, r1 Calculate this one later + umulh r8, r0, r1 + mul r9, r0, r2 + umulh r10, r0, r2 + mul r11, r0, r3 + umulh r12, r0, r3 + mul r13, r0, r4 + umulh r14, r0, r4 + mul r15, r0, r5 + + adds r8, r8, r9 + + umulh r16, r0, r5 + mul r17, r0, r6 + + adcs r10, r10, r11 + + umulh r18, r0, r6 + mul r19, r0, r7 + + adcs r12, r12, r13 + + umulh r20, r0, r7 + mul r21, r1, r2 + + adcs r14, r14, r15 + + mul r22, r1, r3 + mul r23, r1, r4 + + adcs r16, r16, r17 + + mul r24, r1, r5 + mul r25, r1, r6 + + adcs r18, r18, r19 + + mul r26, r1, r7 + umulh r9, r1, r7 + + cinc r20, r20, cs + C 8, (10, 21), (12, 22), (14, 23), (16, 24), (18, 25), (20, 26), 9 + C Free: 11, 13, 15, 17, 19 + + umulh r11, r1, r2 + umulh r13, r1, r3 + + adds r10, r10, r21 + adcs r12, r12, r22 + + umulh r15, r1, r4 + umulh r17, r1, r5 + + adcs r14, r14, r23 + adcs r16, r16, r24 + + umulh r19, r1, r6 + mul r21, r2, r3 + + adcs r18, r18, r25 + adcs r20, r20, r26 + + mul r22, r2, r4 + mul r23, r2, r5 + + cinc r9, r9, cs + C 8, 10, (12, 11), (14, 13, 21), (16, 15, 22), (18, 17, 23), (20, 19), 9 + C Free: 24, 25, 26 + + adds r12, r12, r11 + + mul r24, r2, r6 + mul r25, r2, r7 + + adcs r14, r14, r13 + adcs r16, r16, r15 + + umulh r26, r2, r7 + umulh r11, r2, r3 + + adcs r18, r18, r17 + adcs r20, r20, r19 + + umulh r13, r2, r4 + umulh r15, r2, r5 + + cinc r9, r9, cs + C 8, 10, 12, (14, 21), (16, 22, 11), (18, 23, 13), (20, 24, 15), (9, 25), 26 + C Free: 17, 19 + + adds r14, r14, r21 + + umulh r17, r2, r6 + mul r19, r3, r4 + + adcs r16, r16, r22 + adcs r18, r18, r23 + + mul r21, r3, r5 + mul r22, r3, r6 + + adcs r20, r20, r24 + adcs r9, r9, r25 + + mul r23, r3, r7 + umulh r24, r3, r7 + + cinc r26, r26, cs + C 8, 10, 12, 14, (16, 11), (18, 13, 19), (20, 15, 21), (9, 17, 22), (26, 23), 24 + C Free: 25 + + adds r16, r16, r11 + + umulh r25, r3, r4 + umulh r11, r3, r5 + + adcs r18, r18, r13 + adcs r20, r20, r15 + + umulh r13, r3, r6 + umulh r15, r4, r6 + + adcs r9, r9, r17 + cinc r26, r26, cs + C 8, 10, 12, 14, 16, (18, 19), (20, 21, 25), (9, 22, 11), (26, 23, 13), (24, 15) + C Free: 17 + + adds r18, r18, r19 + + umulh r17, r4, r7 + mul r19, r4, r5 + + adcs r20, r20, r21 + adcs r9, r9, r22 + + umulh r21, r4, r5 + mul r22, r4, r7 + + adcs r26, r26, r23 + cinc r24, r24, cs + C 8, 10, 12, 14, 16, 18, (20, 25), (9, 11, 19), (26, 13, 21), (24, 15, 22), 17 + C Free: 23 + + adds r20, r20, r25 + + umulh r23, r5, r6 + umulh r25, r5, r7 + + adcs r9, r9, r11 + adcs r26, r26, r13 + + mul r11, r4, r6 + mul r13, r5, r6 + + adcs r24, r24, r15 + cinc r17, r17, cs + C 8, 10, 12, 14, 16, 18, 20, (9, 19), (26, 21, 11), (24, 22, 13), (17, 23), 25 + C Free: 15 + + adds r9, r9, r19 + + mul r15, r5, r7 + mul r19, r6, r7 + + adcs r26, r26, r21 + adcs r24, r24, r22 + + umulh r21, r6, r7 + mul r22, r0, r1 + + adcs r17, r17, r23 + cinc r25, r25, cs + + mul r23, r0, r0 + umulh r0, r0, r0 + + adds r26, r26, r11 + adcs r24, r24, r13 + + mul r11, r1, r1 + umulh r1, r1, r1 + + adcs r17, r17, r15 + adcs r25, r25, r19 + + mul r13, r2, r2 + umulh r2, r2, r2 + + cinc r21, r21, cs + C 22, 8, 10, 12, 14, 16, 18, 20, 9, 26, 24, 17, 25, 21 + C 23, 0, 11, 1, 13, 2 + C Free: 15, 19 + + adds r22, r22, r22 + adcs r8, r8, r8 + adcs r10, r10, r10 + adcs r12, r12, r12 + adcs r14, r14, r14 + adcs r16, r16, r16 + + mul r19, r3, r3 + umulh r3, r3, r3 + + adcs r18, r18, r18 + adcs r20, r20, r20 + adcs r9, r9, r9 + lsr r15, r21, #63 + adcs r26, r26, r26 + adcs r24, r24, r24 + adcs r17, r17, r17 + adcs r25, r25, r25 + adc r21, r21, r21 + C 22, 8, 10, 12, 14, 16, 18, 20, 9, 26, 24, 17, 25, 21, 15 + C 23, 0, 11, 1, 13, 2, 19, 3 + C Free: + + adds r0, r0, r22 + + mul r22, r4, r4 + umulh r4, r4, r4 + + adcs r11, r11, r8 + adcs r1, r1, r10 + + stp r23, r0, [rp] + + mul r8, r5, r5 + umulh r5, r5, r5 + + adcs r13, r13, r12 + adcs r2, r2, r14 + + stp r11, r1, [rp, #2*8] + + mul r10, r6, r6 + umulh r6, r6, r6 + + adcs r16, r16, r19 + adcs r3, r3, r18 + + stp r13, r2, [rp, #4*8] + + mul r12, r7, r7 + umulh r7, r7, r7 + + adcs r22, r22, r20 + adcs r4, r4, r9 + + stp r16, r3, [rp, #6*8] + ldp r19, r20, [sp, #2*8] + + adcs r8, r8, r26 + adcs r5, r5, r24 + + stp r22, r4, [rp, #8*8] + ldp r23, r24, [sp, #6*8] + + adcs r10, r10, r17 + adcs r6, r6, r25 + + stp r8, r5, [rp, #10*8] + ldp r25, r26, [sp, #8*8] + + adcs r12, r12, r21 + adc r7, r7, r15 + + stp r10, r6, [rp, #12*8] + ldp r21, r22, [sp, #4*8] + stp r12, r7, [rp, #14*8] + ldp r17, r18, [sp], #10*8 + + mov res, r7 + + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqr_9) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + stp r17, r18, [sp, #-12*8]! + stp r19, r20, [sp, #2*8] + ldp r6, r7, [ap, #6*8] + ldr r8, [ap, #8*8] + stp r21, r22, [sp, #4*8] + stp r23, r24, [sp, #6*8] + stp r25, r26, [sp, #8*8] + str r27, [sp, #10*8] + + C mul rX, r0, r1 Calculated later + umulh r9, r0, r1 + mul r10, r0, r2 + umulh r11, r0, r2 + mul r12, r0, r3 + umulh r13, r0, r3 + mul r14, r0, r4 + umulh r15, r0, r4 + mul r16, r0, r5 + + adds r9, r9, r10 + + umulh r17, r0, r5 + mul r18, r0, r6 + + adcs r11, r11, r12 + + umulh r19, r0, r6 + mul r20, r0, r7 + + adcs r13, r13, r14 + + umulh r21, r0, r7 + mul r22, r0, r8 + + adcs r15, r15, r16 + + umulh r23, r0, r8 + mul r24, r1, r2 + + adcs r17, r17, r18 + + mul r25, r1, r3 + mul r26, r1, r4 + + adcs r19, r19, r20 + + mul r27, r1, r5 + mul r10, r1, r6 + + adcs r21, r21, r22 + + mul r12, r1, r7 + mul r14, r1, r8 + + cinc r23, r23, cs + C 9, (11, 24), (13, 25), (15, 26), (17, 27), (19, 10), (21, 12), (23, 14) + C Free: 16, 18, 20, 22 + + umulh r16, r1, r8 + umulh r18, r1, r2 + + adds r11, r11, r24 + adcs r13, r13, r25 + + umulh r20, r1, r3 + umulh r22, r1, r4 + + adcs r15, r15, r26 + adcs r17, r17, r27 + + umulh r24, r1, r5 + umulh r25, r1, r6 + + adcs r19, r19, r10 + adcs r21, r21, r12 + + umulh r26, r1, r7 + mul r27, r2, r3 + + adcs r23, r23, r14 + cinc r16, r16, cs + C 9, 11, (13, 18), (15, 20, 27), (17, 22), (19, 24), (21, 25), (23, 26), 16 + C Free: 10, 12, 14 + + mul r10, r2, r4 + mul r12, r2, r5 + + adds r13, r13, r18 + adcs r15, r15, r20 + + mul r14, r2, r6 + mul r18, r2, r7 + + adcs r17, r17, r22 + adcs r19, r19, r24 + + mul r20, r2, r8 + umulh r22, r2, r8 + + adcs r21, r21, r25 + adcs r23, r23, r26 + + umulh r24, r2, r3 + umulh r25, r2, r4 + + cinc r16, r16, cs + C 9, 11, 13, (15, 27), (17, 10, 24), (19, 12, 25), (21, 14), (23, 18), (16, 20), 22 + C Free: 26 + + adds r15, r15, r27 + + umulh r26, r2, r5 + umulh r27, r2, r6 + + adcs r17, r17, r10 + adcs r19, r19, r12 + + umulh r10, r2, r7 + mul r12, r3, r4 + + adcs r21, r21, r14 + adcs r23, r23, r18 + + mul r14, r3, r5 + mul r18, r3, r6 + + adcs r16, r16, r20 + cinc r22, r22, cs + C 9, 11, 13, 15, (17, 24), (19, 25, 12), (21, 26, 14), (23, 27, 18), (16, 10), 22 + C Free: 20 + + adds r17, r17, r24 + + mul r20, r3, r7 + mul r24, r3, r8 + + adcs r19, r19, r25 + adcs r21, r21, r26 + + umulh r25, r3, r8 + umulh r26, r3, r4 + + adcs r23, r23, r27 + adcs r16, r16, r10 + + umulh r27, r3, r5 + umulh r10, r3, r6 + + cinc r22, r22, cs + C 9, 11, 13, 15, 17, (19, 12), (21, 14, 26), (23, 18, 27), (16, 20, 10), (22, 24), 25 + + adds r19, r19, r12 + adcs r21, r21, r14 + + umulh r12, r3, r7 + mul r14, r4, r5 + + adcs r23, r23, r18 + adcs r16, r16, r20 + + mul r18, r4, r6 + mul r20, r4, r7 + + adcs r22, r22, r24 + cinc r25, r25, cs + C 9, 11, 13, 15, 17, 19, (21, 26), (23, 27, 14), (16, 10, 18), (22, 12, 20), 25 + + adds r21, r21, r26 + + mul r24, r4, r8 + umulh r26, r4, r8 + + adcs r23, r23, r27 + adcs r16, r16, r10 + + umulh r27, r4, r5 + umulh r10, r4, r6 + + adcs r22, r22, r12 + cinc r25, r25, cs + C 9, 11, 13, 15, 17, 19, 21, (23, 14), (16, 18, 27), (22, 20, 10), (25, 24), 26 + + adds r23, r23, r14 + + umulh r12, r4, r7 + umulh r14, r5, r7 + + adcs r16, r16, r18 + adcs r22, r22, r20 + + umulh r18, r5, r8 + mul r20, r5, r6 + + adcs r25, r25, r24 + cinc r26, r26, cs + C 9, 11, 13, 15, 17, 19, 21, 23, (16, 27), (22, 10, 20), (25, 12), (26, 14), 18 + + adds r16, r16, r27 + + umulh r24, r5, r6 + mul r27, r5, r8 + + adcs r22, r22, r10 + adcs r25, r25, r12 + + umulh r10, r6, r7 + umulh r12, r6, r8 + + adcs r26, r26, r14 + cinc r18, r18, cs + C 9, 11, 13, 15, 17, 19, 21, 23, 16, (22, 20), (25, 24), (26, 27), (18, 10), 12 + + adds r22, r22, r20 + + mul r14, r5, r7 + mul r20, r6, r7 + + adcs r25, r25, r24 + adcs r26, r26, r27 + + mul r24, r6, r8 + mul r27, r7, r8 + + adcs r18, r18, r10 + cinc r12, r12, cs + + umulh r10, r7, r8 + + adds r25, r25, r14 + + mul r14, r0, r1 + + adcs r26, r26, r20 + + mul r20, r0, r0 + umulh r0, r0, r0 + + adcs r18, r18, r24 + adcs r12, r12, r27 + + mul r24, r1, r1 + umulh r1, r1, r1 + + cinc r10, r10, cs + C 14, 9, 11, 13, 15, 17, 19, 21, 23, 16, 22, 25, 26, 18, 12, 10 + C 20, 0, 24, 1 + C Free: 27 + + adds r14, r14, r14 + adcs r9, r9, r9 + adcs r11, r11, r11 + adcs r13, r13, r13 + adcs r15, r15, r15 + adcs r17, r17, r17 + adcs r19, r19, r19 + adcs r21, r21, r21 + lsr r27, r10, #63 + adcs r23, r23, r23 + adcs r16, r16, r16 + adcs r22, r22, r22 + adcs r25, r25, r25 + adcs r26, r26, r26 + adcs r18, r18, r18 + adcs r12, r12, r12 + adc r10, r10, r10 + C 14, 9, 11, 13, 15, 17, 19, 21, 23, 16, 22, 25, 26, 18, 12, 10, 27 + C 20, 0, 24, 1 + C Free: + + adds r0, r0, r14 + + mul r14, r2, r2 + umulh r2, r2, r2 + + adcs r24, r24, r9 + adcs r1, r1, r11 + + mul r9, r3, r3 + umulh r3, r3, r3 + + stp r20, r0, [rp] + + mul r11, r4, r4 + umulh r4, r4, r4 + + stp r24, r1, [rp, #2*8] + + adcs r14, r14, r13 + adcs r2, r2, r15 + + mul r0, r5, r5 + umulh r5, r5, r5 + + mul r1, r6, r6 + umulh r6, r6, r6 + + adcs r9, r9, r17 + adcs r3, r3, r19 + + stp r14, r2, [rp, #4*8] + + mul r13, r7, r7 + umulh r7, r7, r7 + + adcs r11, r11, r21 + adcs r4, r4, r23 + + ldp r19, r20, [sp, #2*8] + stp r9, r3, [rp, #6*8] + + mul r15, r8, r8 + umulh r8, r8, r8 + + adcs r0, r0, r16 + adcs r5, r5, r22 + + ldp r21, r22, [sp, #4*8] + + adcs r1, r1, r25 + adcs r6, r6, r26 + + stp r11, r4, [rp, #8*8] + ldp r23, r24, [sp, #6*8] + + adcs r13, r13, r18 + adcs r7, r7, r12 + + stp r0, r5, [rp, #10*8] + ldp r25, r26, [sp, #8*8] + + adcs r15, r15, r10 + adc r8, r8, r27 + + stp r1, r6, [rp, #12*8] + ldr r27, [sp, #10*8] + stp r13, r7, [rp, #14*8] + ldp r17, r18, [sp], #12*8 + stp r15, r8, [rp, #16*8] + + mov res, r8 + + ret +EPILOGUE() diff --git a/src/mpn_extras/arm64/sqrhigh_hard.asm b/src/mpn_extras/arm64/sqrhigh_hard.asm new file mode 100644 index 0000000000..8e5275b286 --- /dev/null +++ b/src/mpn_extras/arm64/sqrhigh_hard.asm @@ -0,0 +1,528 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. +dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +define(`rp', `x0') +define(`ap', `x1') + +define(`r0', `x2') +define(`r1', `x3') +define(`r2', `x4') +define(`r3', `x5') +define(`r4', `x6') +define(`r5', `x7') +define(`r6', `x8') +define(`r7', `x9') +define(`r8', `x10') +define(`r9', `x11') +define(`r10', `x12') +define(`r11', `x13') +define(`r12', `x14') +define(`r13', `x15') +define(`r14', `x16') +define(`r15', `x17') + +define(`r16', `ap') dnl Beware! + +dnl NOTE: The following has to be pushed and popped +dnl NOTE: At least on Apple, we need to preserve both x18 and x29 at all times! +define(`r17', `x19') +define(`r18', `x20') +define(`r19', `x21') +define(`r20', `x22') +define(`r21', `x23') +define(`r22', `x24') +define(`r23', `x25') +define(`r24', `x26') +define(`r25', `x27') +define(`r26', `x28') +define(`r27', `x30') + +define(`res', `rp') dnl NOTE: Synonymous with rp! + +PROLOGUE(flint_mpn_sqrhigh_1) + ldr r0, [ap] + umulh r1, r0, r0 + mul r0, r0, r0 + str r1, [rp] + mov res, r0 + ret +EPILOGUE() + +PROLOGUE(flint_mpn_sqrhigh_2) + ldp r0, r1, [ap] + umulh r2, r0, r0 + mul r3, r0, r1 + umulh r4, r0, r1 + mul r5, r1, r1 + umulh r6, r1, r1 + adds r2, r2, r3 + adcs r5, r5, r4 + cinc r6, r6, cs + adds r2, r2, r3 + adcs r5, r5, r4 + cinc r6, r6, cs + stp r5, r6, [rp] + mov res, r2 + ret +EPILOGUE() + +C 0 1 2 +C 1 h x +C 2 d x +C 3 d +PROLOGUE(flint_mpn_sqrhigh_3) + ldp r0, r1, [ap] + ldr r2, [ap, #2*8] + umulh r3, r0, r1 + mul r4, r0, r2 + umulh r0, r0, r2 + mul r5, r1, r2 + umulh r6, r1, r2 + mul r7, r1, r1 + adds r3, r3, r4 + umulh r1, r1, r1 + mul r8, r2, r2 + adcs r0, r0, r5 + umulh r2, r2, r2 + cinc r6, r6, cs + adds r3, r3, r3 + adcs r0, r0, r0 + adcs r6, r6, r6 + cset r4, cs + adds r7, r7, r3 + adcs r1, r1, r0 + adcs r8, r8, r6 + adc r2, r2, r4 + stp r1, r8, [rp] + str r2, [rp, #2*8] + mov res, r7 + ret +EPILOGUE() + +C 0 1 2 3 +C 0 h x +C 1 e x x +C 2 d x +C 3 d +PROLOGUE(flint_mpn_sqrhigh_4) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + umulh r4, r0, r2 + mul r5, r0, r3 + umulh r6, r0, r3 + umulh r7, r1, r2 + umulh r8, r1, r3 + mul r9, r1, r2 + adds r4, r4, r5 + mul r10, r1, r3 + mul r11, r2, r3 + adcs r6, r6, r7 + umulh r12, r2, r3 + umulh r1, r1, r1 + cinc r8, r8, cs + mul r13, r2, r2 + umulh r2, r2, r2 + adds r4, r4, r9 + mul r14, r3, r3 + umulh r3, r3, r3 + adcs r6, r6, r10 + adcs r8, r8, r11 + cinc r12, r12, cs + adds r4, r4, r4 + adcs r6, r6, r6 + adcs r8, r8, r8 + adcs r12, r12, r12 + cset r0, cs + adds r4, r4, r1 + adcs r6, r6, r13 + adcs r8, r8, r2 + adcs r12, r12, r14 + adc r0, r0, r3 + stp r6, r8, [rp] + stp r12, r0, [rp, #2*8] + mov res, r4 + + ret +EPILOGUE() + +C 0 1 2 3 4 +C 0 h x +C 1 h x x +C 2 d x x +C 3 d x +C 4 d +PROLOGUE(flint_mpn_sqrhigh_5) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldr r4, [ap, #4*8] + + umulh r5, r0, r3 + mul r6, r0, r4 + umulh r7, r0, r4 + umulh r8, r1, r3 + umulh r9, r1, r4 + umulh r10, r1, r2 + adds r5, r5, r6 + mul r11, r1, r4 + umulh r12, r2, r3 + adcs r7, r7, r8 + umulh r13, r2, r4 + mul r14, r1, r3 + cinc r9, r9, cs + mul r15, r2, r3 + mul r16, r2, r4 + adds r5, r5, r10 + adcs r7, r7, r11 + mul r6, r3, r4 + umulh r8, r3, r4 + adcs r9, r9, r12 + cinc r13, r13, cs + mul r0, r2, r2 + umulh r2, r2, r2 + adds r5, r5, r14 + adcs r7, r7, r15 + mul r1, r3, r3 + umulh r3, r3, r3 + adcs r9, r9, r16 + adcs r13, r13, r6 + mul r10, r4, r4 + umulh r4, r4, r4 + cinc r8, r8, cs + adds r5, r5, r5 + adcs r7, r7, r7 + adcs r9, r9, r9 + adcs r13, r13, r13 + adcs r8, r8, r8 + cset r11, cs + adds r5, r5, r0 + adcs r7, r7, r2 + adcs r9, r9, r1 + adcs r13, r13, r3 + stp r7, r9, [rp] + adcs r8, r8, r10 + stp r13, r8, [rp, #2*8] + adc r11, r11, r4 + str r11, [rp, #4*8] + mov res, r5 + + ret +EPILOGUE() + +C 0 1 2 3 4 5 +C 0 h x +C 1 h x x +C 2 e x x x +C 3 d x x +C 4 d x +C 5 d +PROLOGUE(flint_mpn_sqrhigh_6) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + umulh r6, r1, r3 + mul r7, r2, r3 + umulh r8, r2, r3 + umulh r9, r1, r4 + umulh r10, r2, r4 + umulh r11, r0, r4 + adds r6, r6, r7 + mul r12, r2, r4 + mul r13, r3, r4 + adcs r8, r8, r9 + umulh r14, r3, r4 + mul r15, r1, r4 + cinc r10, r10, cs + umulh r16, r0, r5 + adds r6, r6, r11 + umulh r7, r1, r5 + adcs r8, r8, r12 + umulh r9, r2, r5 + adcs r10, r10, r13 + umulh r11, r3, r5 + cinc r14, r14, cs + mul r0, r0, r5 + adds r6, r6, r15 + mul r1, r1, r5 + adcs r8, r8, r16 + mul r12, r2, r5 + adcs r10, r10, r7 + mul r13, r3, r5 + adcs r14, r14, r9 + mul r15, r4, r5 + cinc r11, r11, cs + adds r6, r6, r0 + umulh r16, r4, r5 + adcs r8, r8, r1 + umulh r2, r2, r2 + adcs r10, r10, r12 + mul r7, r3, r3 + adcs r14, r14, r13 + umulh r3, r3, r3 + adcs r11, r11, r15 + mul r9, r4, r4 + cinc r16, r16, cs + umulh r4, r4, r4 + adds r6, r6, r6 + adcs r8, r8, r8 + mul r0, r5, r5 + adcs r10, r10, r10 + adcs r14, r14, r14 + umulh r5, r5, r5 + adcs r11, r11, r11 + adcs r16, r16, r16 + cset r1, cs + adds r2, r2, r6 + adcs r7, r7, r8 + adcs r3, r3, r10 + stp r7, r3, [rp] + adcs r9, r9, r14 + adcs r4, r4, r11 + stp r9, r4, [rp, #2*8] + adcs r0, r0, r16 + adc r5, r5, r1 + stp r0, r5, [rp, #4*8] + mov res, r2 + + ret +EPILOGUE() + +C 0 1 2 3 4 5 6 +C 0 h x +C 1 h x x +C 2 h x x x +C 3 d x x x +C 4 d x x +C 5 d x +C 6 d +PROLOGUE(flint_mpn_sqrhigh_7) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + ldr r6, [ap, #6*8] + + umulh r7, r2, r3 + + umulh r8, r1, r4 + umulh r9, r2, r4 + mul r10, r3, r4 + umulh r11, r3, r4 + mul r12, r2, r4 + + adds r7, r7, r8 + umulh r13, r1, r5 + umulh r14, r2, r5 + adcs r9, r9, r10 + umulh r15, r3, r5 + cinc r11, r11, cs + umulh r16, r0, r5 + adds r7, r7, r12 + mul r8, r2, r5 + adcs r9, r9, r13 + mul r10, r3, r5 + adcs r11, r11, r14 + mul r12, r4, r5 + cinc r15, r15, cs + umulh r13, r4, r5 + + adds r7, r7, r16 + mul r14, r1, r5 + + adcs r9, r9, r8 + umulh r16, r0, r6 + adcs r11, r11, r10 + umulh r8, r1, r6 + adcs r15, r15, r12 + umulh r10, r2, r6 + cinc r13, r13, cs + umulh r12, r3, r6 + adds r7, r7, r14 + umulh r14, r4, r6 + + adcs r9, r9, r16 + mul r0, r0, r6 + adcs r11, r11, r8 + mul r1, r1, r6 + adcs r15, r15, r10 + mul r2, r2, r6 + adcs r13, r13, r12 + mul r16, r3, r6 + cinc r14, r14, cs + mul r8, r4, r6 + adds r7, r7, r0 + mul r10, r5, r6 + adcs r9, r9, r1 + umulh r12, r5, r6 + + mul r0, r3, r3 + adcs r11, r11, r2 + umulh r3, r3, r3 + adcs r15, r15, r16 + mul r1, r4, r4 + adcs r13, r13, r8 + umulh r4, r4, r4 + adcs r14, r14, r10 + mul r2, r5, r5 + cinc r12, r12, cs + umulh r5, r5, r5 + adds r7, r7, r7 + adcs r9, r9, r9 + mul r16, r6, r6 + adcs r11, r11, r11 + adcs r15, r15, r15 + umulh r6, r6, r6 + adcs r13, r13, r13 + adcs r14, r14, r14 + adcs r12, r12, r12 + cset r8, cs + + adds r0, r0, r7 + adcs r3, r3, r9 + adcs r1, r1, r11 + stp r3, r1, [rp] + adcs r4, r4, r15 + adcs r2, r2, r13 + stp r4, r2, [rp, #2*8] + adcs r5, r5, r14 + adcs r16, r16, r12 + stp r5, r16, [rp, #4*8] + adc r6, r6, r8 + str r6, [rp, #6*8] + mov res, r0 + + ret +EPILOGUE() + +C 0 1 2 3 4 5 6 7 +C 0 h x +C 1 h x x +C 2 h x x x +C 3 e x x x x +C 4 d x x x +C 5 d x x +C 6 d x +C 7 d +PROLOGUE(flint_mpn_sqrhigh_8) + ldp r0, r1, [ap] + ldp r2, r3, [ap, #2*8] + ldp r4, r5, [ap, #4*8] + ldp r6, r7, [ap, #6*8] + + umulh r8, r2, r4 + mul r9, r3, r4 + umulh r10, r3, r4 + + umulh r11, r2, r5 + umulh r12, r3, r5 + umulh r13, r1, r5 + adds r8, r8, r9 + mul r14, r3, r5 + mul r15, r4, r5 + adcs r10, r10, r11 + umulh r16, r4, r5 + cinc r12, r12, cs + mul r9, r2, r5 + + adds r8, r8, r13 + umulh r11, r1, r6 + adcs r10, r10, r14 + umulh r13, r2, r6 + adcs r12, r12, r15 + umulh r14, r3, r6 + cinc r16, r16, cs + umulh r15, r4, r6 + adds r8, r8, r9 + mul r9, r1, r6 + adcs r10, r10, r11 + mul r11, r2, r6 + adcs r12, r12, r13 + mul r13, r3, r6 + adcs r16, r16, r14 + mul r14, r4, r6 + cinc r15, r15, cs + adds r8, r8, r9 + mul r9, r5, r6 + adcs r10, r10, r11 + umulh r11, r5, r6 + adcs r12, r12, r13 + umulh r13, r0, r6 + + adcs r16, r16, r14 + umulh r14, r0, r7 + adcs r15, r15, r9 + umulh r9, r1, r7 + cinc r11, r11, cs + mul r0, r0, r7 C Reduce latency + adds r8, r8, r13 + umulh r13, r2, r7 + adcs r10, r10, r14 + umulh r14, r3, r7 + adcs r12, r12, r9 + umulh r9, r4, r7 + adcs r16, r16, r13 + umulh r13, r5, r7 + mul r1, r1, r7 + adcs r15, r15, r14 + mul r2, r2, r7 + adcs r11, r11, r9 + mul r14, r3, r7 + cinc r13, r13, cs + mul r9, r4, r7 + adds r8, r8, r0 + mul r0, r5, r7 + adcs r10, r10, r1 + mul r1, r6, r7 + adcs r12, r12, r2 + umulh r2, r6, r7 + + adcs r16, r16, r14 + umulh r3, r3, r3 + adcs r15, r15, r9 + mul r14, r4, r4 + adcs r11, r11, r0 + umulh r4, r4, r4 + adcs r13, r13, r1 + mul r9, r5, r5 + cinc r2, r2, cs + umulh r5, r5, r5 + + adds r8, r8, r8 + mul r0, r6, r6 + adcs r10, r10, r10 + umulh r6, r6, r6 + adcs r12, r12, r12 + adcs r16, r16, r16 + adcs r15, r15, r15 + adcs r11, r11, r11 + adcs r13, r13, r13 + adcs r2, r2, r2 + cset r1, cs + + adds r3, r3, r8 + mul r8, r7, r7 + adcs r14, r14, r10 + umulh r7, r7, r7 + adcs r4, r4, r12 + stp r14, r4, [rp] + adcs r9, r9, r16 + adcs r5, r5, r15 + stp r9, r5, [rp, #2*8] + adcs r0, r0, r11 + adcs r6, r6, r13 + stp r0, r6, [rp, #4*8] + adcs r8, r8, r2 + adc r7, r7, r1 + stp r8, r7, [rp, #6*8] + mov res, r3 + + ret +EPILOGUE() diff --git a/src/mpn_extras/mulhigh_basecase.c b/src/mpn_extras/mulhigh_basecase.c index f38249e1a9..0f4adc0c3c 100644 --- a/src/mpn_extras/mulhigh_basecase.c +++ b/src/mpn_extras/mulhigh_basecase.c @@ -82,7 +82,55 @@ const flint_mpn_sqr_func_t flint_mpn_sqrhigh_func_tab[] = flint_mpn_sqrhigh_7, flint_mpn_sqrhigh_8 }; +#elif FLINT_HAVE_ASSEMBLY_armv8 +mp_limb_t flint_mpn_mulhigh_1(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_2(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_3(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_4(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_5(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_6(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_7(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mulhigh_8(mp_ptr, mp_srcptr, mp_srcptr); + +mp_limb_t flint_mpn_sqrhigh_1(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqrhigh_2(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqrhigh_3(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqrhigh_4(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqrhigh_5(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqrhigh_6(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqrhigh_7(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqrhigh_8(mp_ptr, mp_srcptr); + +const flint_mpn_mul_func_t flint_mpn_mulhigh_func_tab[] = +{ + NULL, + flint_mpn_mulhigh_1, + flint_mpn_mulhigh_2, + flint_mpn_mulhigh_3, + flint_mpn_mulhigh_4, + flint_mpn_mulhigh_5, + flint_mpn_mulhigh_6, + flint_mpn_mulhigh_7, + flint_mpn_mulhigh_8, +}; + +const flint_mpn_mulhigh_normalised_func_t flint_mpn_mulhigh_normalised_func_tab[] = +{ + NULL, +}; +const flint_mpn_sqr_func_t flint_mpn_sqrhigh_func_tab[] = +{ + NULL, + flint_mpn_sqrhigh_1, + flint_mpn_sqrhigh_2, + flint_mpn_sqrhigh_3, + flint_mpn_sqrhigh_4, + flint_mpn_sqrhigh_5, + flint_mpn_sqrhigh_6, + flint_mpn_sqrhigh_7, + flint_mpn_sqrhigh_8, +}; #else /* todo: add MPFR-like basecase for use in mulders */ diff --git a/src/mpn_extras/profile/p-mul.c b/src/mpn_extras/profile/p-mul.c index c0b29a47c9..2cdf461e75 100644 --- a/src/mpn_extras/profile/p-mul.c +++ b/src/mpn_extras/profile/p-mul.c @@ -16,24 +16,42 @@ #define N_MAX 16 #define N_MAX2 256 -static void measure(mp_ptr rpg, mp_ptr rpf, mp_ptr xp, mp_ptr yp, slong m) +static void measure(mp_ptr rpg, mp_ptr rpf, mp_ptr xp, mp_ptr yp, slong m, slong n_max) { double t1, t2, FLINT_SET_BUT_UNUSED(__); slong n; flint_printf("m = %3wd:", m); - for (n = 1; n <= m; n = (n <= N_MAX ? n + 1 : n * 1.2)) + for (n = 1; n <= n_max; n = (n <= N_MAX ? n + 1 : n * 1.2)) { - mpn_random2(xp, m); - mpn_random2(yp, n); + mp_ptr ap, bp; + slong mt, nt; + + if (m >= n) + { + mt = m; + nt = n; + ap = xp; + bp = yp; + } + else + { + mt = n; + nt = m; + ap = yp; + bp = xp; + } + + mpn_random2(ap, mt); + mpn_random2(bp, nt); TIMEIT_START - mpn_mul(rpg, xp, m, yp, n); + mpn_mul(rpg, ap, mt, bp, nt); TIMEIT_STOP_VALUES(__, t1) TIMEIT_START - flint_mpn_mul(rpf, xp, m, yp, n); + flint_mpn_mul(rpf, ap, mt, bp, nt); TIMEIT_STOP_VALUES(__, t2) flint_printf(" %.2f", t1 / t2); @@ -44,21 +62,42 @@ static void measure(mp_ptr rpg, mp_ptr rpf, mp_ptr xp, mp_ptr yp, slong m) int main(int argc, char ** argv) { - mp_limb_t xp[N_MAX2], yp[N_MAX2], rpf[2 * N_MAX2], rpg[2 * N_MAX2]; + mp_ptr xp, yp, rpf, rpg; slong m; + xp = flint_malloc(sizeof(mp_limb_t) * N_MAX2); + yp = flint_malloc(sizeof(mp_limb_t) * N_MAX2); + rpf = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX2); + rpg = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX2); + if (argc == 2) { - measure(rpg, rpf, xp, yp, strtol(argv[1], NULL, 10)); + measure(rpg, rpf, xp, yp, strtol(argv[1], NULL, 10), N_MAX2); + goto end; + } + else if (argc == 3) + { + slong m = strtol(argv[1], NULL, 10); + slong n_max = strtol(argv[2], NULL, 10); + xp = flint_realloc(xp, sizeof(mp_limb_t) * m); + yp = flint_realloc(yp, sizeof(mp_limb_t) * n_max); + rpf = flint_realloc(rpf, sizeof(mp_limb_t) * (m + n_max)); + rpg = flint_realloc(rpg, sizeof(mp_limb_t) * (m + n_max)); + measure(rpg, rpf, xp, yp, m, n_max); goto end; } flint_printf("mpn_mul vs flint_mpn_mul\n\n"); for (m = 1; m <= N_MAX2; m = (m <= N_MAX ? m + 1 : m * 1.2)) - measure(rpg, rpf, xp, yp, m); + measure(rpg, rpf, xp, yp, m, m); end: + flint_free(xp); + flint_free(yp); + flint_free(rpf); + flint_free(rpg); + flint_cleanup_master(); return 0; } diff --git a/src/mpn_extras/sqr_basecase.c b/src/mpn_extras/sqr_basecase.c index 8591e28598..5feab800a4 100644 --- a/src/mpn_extras/sqr_basecase.c +++ b/src/mpn_extras/sqr_basecase.c @@ -99,6 +99,30 @@ const flint_mpn_sqr_func_t flint_mpn_sqr_func_tab[] = { flint_mpn_sqr_14 }; +#elif FLINT_HAVE_ASSEMBLY_armv8 +mp_limb_t flint_mpn_sqr_1(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_2(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_3(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_4(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_5(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_6(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_7(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_8(mp_ptr, mp_srcptr); +mp_limb_t flint_mpn_sqr_9(mp_ptr, mp_srcptr); + +const flint_mpn_sqr_func_t flint_mpn_sqr_func_tab[] = +{ + NULL, + flint_mpn_sqr_1, + flint_mpn_sqr_2, + flint_mpn_sqr_3, + flint_mpn_sqr_4, + flint_mpn_sqr_5, + flint_mpn_sqr_6, + flint_mpn_sqr_7, + flint_mpn_sqr_8, + flint_mpn_sqr_9 +}; #else /* Currently generic C code performs worse than GMP. */ diff --git a/src/mpn_extras/sqrhigh.c b/src/mpn_extras/sqrhigh.c index baeb944f40..e6d650e424 100644 --- a/src/mpn_extras/sqrhigh.c +++ b/src/mpn_extras/sqrhigh.c @@ -17,8 +17,6 @@ #include "mpn_extras.h" -#if FLINT_HAVE_NATIVE_mpn_mulhigh_basecase - /* Generated by tune-sqrhigh.c */ static const signed short flint_mpn_sqrhigh_k_tab[FLINT_MPN_SQRHIGH_K_TAB_SIZE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -167,16 +165,6 @@ _flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n) return bot; } -mp_limb_t flint_mpn_sqrhigh_basecase(mp_ptr rp, mp_srcptr xp, mp_size_t n) -{ - FLINT_ASSERT(n >= 1); - - if (FLINT_HAVE_SQRHIGH_FUNC(n)) /* NOTE: Aliasing allowed here */ - return flint_mpn_sqrhigh_func_tab[n](rp, xp); - else - return _flint_mpn_sqrhigh_basecase(rp, xp, n); -} - mp_limb_t _flint_mpn_sqrhigh(mp_ptr res, mp_srcptr u, mp_size_t n) { @@ -187,7 +175,3 @@ _flint_mpn_sqrhigh(mp_ptr res, mp_srcptr u, mp_size_t n) else return _flint_mpn_sqrhigh_sqr(res, u, n); } - -#else -typedef int this_file_is_empty; -#endif diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index e28b927bd1..999280ff88 100644 --- a/src/mpn_extras/test/main.c +++ b/src/mpn_extras/test/main.c @@ -25,14 +25,14 @@ #include "t-mul_n.c" #include "t-mul_toom22.c" #include "t-mullow_n.c" -#include "t-mulhigh.c" +#include "t-mulhigh_n.c" #include "t-mulhigh_normalised.c" #include "t-mulmod_2expp1.c" #include "t-mulmod_preinv1.c" #include "t-mulmod_preinvn.c" #include "t-remove_2exp.c" #include "t-remove_power.c" -#include "t-sqrhigh_basecase.c" +#include "t-sqr.c" #include "t-sqrhigh.c" /* Array of test functions ***************************************************/ @@ -50,14 +50,14 @@ test_struct tests[] = TEST_FUNCTION(flint_mpn_mul_n), TEST_FUNCTION(flint_mpn_mul_toom22), TEST_FUNCTION(flint_mpn_mullow_n), - TEST_FUNCTION(flint_mpn_mulhigh), + TEST_FUNCTION(flint_mpn_mulhigh_n), TEST_FUNCTION(flint_mpn_mulhigh_normalised), TEST_FUNCTION(flint_mpn_mulmod_2expp1), TEST_FUNCTION(flint_mpn_mulmod_preinv1), TEST_FUNCTION(flint_mpn_mulmod_preinvn), TEST_FUNCTION(flint_mpn_remove_2exp), TEST_FUNCTION(flint_mpn_remove_power), - TEST_FUNCTION(flint_mpn_sqrhigh_basecase), + TEST_FUNCTION(flint_mpn_sqr), TEST_FUNCTION(flint_mpn_sqrhigh) }; diff --git a/src/mpn_extras/test/t-mul.c b/src/mpn_extras/test/t-mul.c index e3187d7b48..f1981cce47 100644 --- a/src/mpn_extras/test/t-mul.c +++ b/src/mpn_extras/test/t-mul.c @@ -72,3 +72,6 @@ TEST_FUNCTION_START(flint_mpn_mul, state) #undef N_MIN #undef N_MAX +#undef WANT_BIG +#undef N_MIN_STOR +#undef N_MAX_STOR diff --git a/src/mpn_extras/test/t-mulhigh.c b/src/mpn_extras/test/t-mulhigh.c deleted file mode 100644 index fba5b702bd..0000000000 --- a/src/mpn_extras/test/t-mulhigh.c +++ /dev/null @@ -1,113 +0,0 @@ -/* - Copyright (C) 2024 Albin Ahlbäck - - This file is part of FLINT. - - FLINT is free software: you can redistribute it and/or modify it under - the terms of the GNU Lesser General Public License (LGPL) as published - by the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. See . -*/ - -#include "test_helpers.h" -#include "mpn_extras.h" - -/* TODO: Remove this preprocessor conditional */ -#if FLINT_HAVE_NATIVE_mpn_mulhigh_basecase - -#define N_MAX (FLINT_MPN_MULHIGH_MUL_CUTOFF + 100) - -#define rpcH (rpc + n - 1) - -/* Defined in t-mulhigh.c and t-sqrhigh.c */ -#ifndef lower_bound -# define lower_bound lower_bound -static ulong lower_bound(ulong n) -{ - /* These are calculated by hand lower bound for the returned limb */ - if (n < 3) - return 0; - else - return 4 * n - 8; -} -#endif - -TEST_FUNCTION_START(flint_mpn_mulhigh, state) -{ - slong ix; - int result; - - mp_ptr rp, rpc, xp, yp; - - rp = flint_malloc(sizeof(mp_limb_t) * N_MAX); - rpc = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX); - xp = flint_malloc(sizeof(mp_limb_t) * N_MAX); - yp = flint_malloc(sizeof(mp_limb_t) * N_MAX); - - for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++) - { - mp_limb_t borrow; - mp_size_t n; - - /* Trigger full multiplication in mulhigh */ - if (n_randint(state, 1000) == 0) - n = FLINT_MPN_MULHIGH_MUL_CUTOFF + n_randint(state, 50); - else if (n_randint(state, 100) == 0) - n = 1 + n_randint(state, FLINT_MPN_MULHIGH_MUL_CUTOFF); - else - n = 1 + n_randint(state, 2 * FLINT_MPN_MULHIGH_MULDERS_CUTOFF); - - mpn_random2(xp, n); - mpn_random2(yp, n); - - rp[0] = flint_mpn_mulhigh_n(rp + 1, xp, yp, n); - - /* Check upper bound */ - flint_mpn_mul_n(rpc, xp, yp, n); - - result = (mpn_cmp(rp, rpcH, n + 1) <= 0); - if (!result) - TEST_FUNCTION_FAIL( - "Bigger than upper bound!\n" - "ix = %wd\n" - "n = %wd\n" - "xp = %{ulong*}\n" - "yp = %{ulong*}\n" - "Upper bound: %{ulong*}\n" - "Got: %{ulong*}\n", - ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1); - - /* Check lower bound */ - borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n)); - if (borrow) - mpn_zero(rpcH, n + 1); - - result = (mpn_cmp(rp, rpcH, n + 1) >= 0); - if (!result) - TEST_FUNCTION_FAIL( - "Smaller than lower bound!\n" - "ix = %wd\n" - "n = %wd\n" - "xp = %{ulong*}\n" - "yp = %{ulong*}\n" - "Lower bound: %{ulong*}\n" - "Got: %{ulong*}\n", - ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1); - } - - flint_free(rp); - flint_free(rpc); - flint_free(xp); - flint_free(yp); - - TEST_FUNCTION_END(state); -} -# undef N_MIN -# undef N_MAX -# undef N_MAX2 -#else -TEST_FUNCTION_START(flint_mpn_mulhigh, state) -{ - TEST_FUNCTION_END_SKIPPED(state); -} -#endif diff --git a/src/mpn_extras/test/t-mulhigh_n.c b/src/mpn_extras/test/t-mulhigh_n.c new file mode 100644 index 0000000000..06a42763bd --- /dev/null +++ b/src/mpn_extras/test/t-mulhigh_n.c @@ -0,0 +1,202 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + Copyright (C) 2024 Fredrik Johansson + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#define N_MIN 1 + +#define WANT_STOR 1 + +#if WANT_STOR +# define MAX_ALLOC_SIZE (FLINT_MPN_MULHIGH_MUL_CUTOFF + 50) +#else +# define N_MAX FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH +# define MAX_ALLOC_SIZE (2 * N_MAX) +#endif + +#define GET_N_FULL_MUL(state) (N_MIN + FLINT_MPN_MULHIGH_MUL_CUTOFF + n_randint(state, 50 - N_MIN + 1)) +#define GET_N_LARGE(state) (N_MIN + n_randint(state, FLINT_MPN_MULHIGH_MUL_CUTOFF - N_MIN + 1)) +#define GET_N_SMALL(state) (N_MIN + n_randint(state, 2 * FLINT_MPN_MULHIGH_MULDERS_CUTOFF - N_MIN + 1)) + +#define rpcH (rpc + n - 1) + +/* Defined in t-mulhigh.c and t-sqrhigh.c */ +#ifndef generic_mulhigh_basecase +# define generic_mulhigh_basecase generic_mulhigh_basecase +static mp_limb_t +generic_mulhigh_basecase(mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t low; + mp_limb_t a, b; + + if (n == 1) + { + umul_ppmm(rp[0], low, up[0], vp[0]); + } + else if (n == 2) + { + FLINT_MPN_MUL_2X2(rp[1], rp[0], low, b, up[1], up[0], vp[1], vp[0]); + } + else if (n == 3) + { + NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, 2); + NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, 3); + NN_DOTREV_S3_A3_1X1(b, a, rp[0], 0, b, a, up + 1, vp + 1, 2); + NN_ADDMUL_S2_A2_1X1(rp[2], rp[1], b, a, up[2], vp[2]); + } + else + { + mp_limb_t tp[2]; + slong ix; + + NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, n - 1); + NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, n); + tp[0] = a; + tp[1] = b; + + umul_ppmm(rp[1], rp[0], up[n - 1], vp[1]); + for (ix = 2; ix < n; ix++) + rp[ix] = mpn_addmul_1(rp, up + n - ix, ix, vp[ix]); + + mpn_add(rp, rp, n, tp, 2); + } + + return low; +} +#endif + +/* Defined in t-mulhigh.c and t-sqrhigh.c */ +#ifndef lower_bound +# define lower_bound lower_bound +static ulong lower_bound(ulong n) +{ + /* These are calculated by hand lower bound for the returned limb */ + if (n < 3) + return 0; + else + return 4 * n - 8; +} +#endif + +TEST_FUNCTION_START(flint_mpn_mulhigh_n, state) +{ + slong ix; + int result; + mp_ptr rp, rpc, xp, yp; + + rpc = flint_malloc(2 * sizeof(mp_limb_t) * MAX_ALLOC_SIZE); + + for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++) + { + mp_limb_t borrow; + mp_size_t n; + +#if WANT_STOR + /* Trigger full multiplication in mulhigh */ + if (n_randint(state, 1000) == 0) + n = GET_N_FULL_MUL(state); + else if (n_randint(state, 100) == 0) + n = GET_N_LARGE(state); + else + n = GET_N_SMALL(state); +#else + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); +#endif + + xp = flint_malloc(sizeof(mp_limb_t) * n); + yp = flint_malloc(sizeof(mp_limb_t) * n); + + mpn_random2(xp, n); + mpn_random2(yp, n); + + if (n <= FLINT_MAX(FLINT_MPN_MULHIGH_MULDERS_CUTOFF, FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH)) + { + /* Check that it is coherent with generic version */ + mp_limb_t ret0, ret1; + + rp = flint_malloc(sizeof(mp_limb_t) * n); + + ret0 = generic_mulhigh_basecase(rpc, xp, yp, n); + ret1 = flint_mpn_mulhigh_n(rp, xp, yp, n); + + result = (mpn_cmp(rp, rpc, n) == 0 && ret0 == ret1); + if (!result) + TEST_FUNCTION_FAIL( + "Basecase not coherent with generic version.\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "Expected ret: %{ulong}\n" + "Got ret: %{ulong}\n" + "Expected limbs: %{ulong*}\n" + "Got limbs: %{ulong*}\n", + ix, n, xp, n, yp, n, ret0, ret1, rpc, n, rp, n); + } + else + { + /* Check that it is coherent with bounds */ + rp = flint_malloc(sizeof(mp_limb_t) * (n + 1)); + + rp[0] = flint_mpn_mulhigh_n(rp + 1, xp, yp, n); + flint_mpn_mul_n(rpc, xp, yp, n); + + result = (mpn_cmp(rp, rpcH, n + 1) <= 0); + if (!result) + TEST_FUNCTION_FAIL( + "Bigger than upper bound!\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "Upper bound: %{ulong*}\n" + "Got: %{ulong*}\n", + ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1); + + /* Check lower bound */ + borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n)); + if (borrow) + mpn_zero(rpcH, n + 1); + + result = (mpn_cmp(rp, rpcH, n + 1) >= 0); + if (!result) + TEST_FUNCTION_FAIL( + "Smaller than lower bound!\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "Lower bound: %{ulong*}\n" + "Got: %{ulong*}\n", + ix, n, xp, n, yp, n, rpcH, n + 1, rp, n + 1); + } + + flint_free(xp); + flint_free(yp); + flint_free(rp); + } + + flint_free(rpc); + + TEST_FUNCTION_END(state); +} + +#undef N_MIN +#undef N_MAX +#undef WANT_STOR +#undef rpcH +#undef MAX_ALLOC_SIZE + +#undef GET_N_FULL_MUL +#undef GET_N_LARGE +#undef GET_N_SMALL diff --git a/src/mpn_extras/test/t-sqr.c b/src/mpn_extras/test/t-sqr.c index 22efdd33ac..0b7c321778 100644 --- a/src/mpn_extras/test/t-sqr.c +++ b/src/mpn_extras/test/t-sqr.c @@ -50,9 +50,9 @@ TEST_FUNCTION_START(flint_mpn_sqr, state) TEST_FUNCTION_FAIL( "n = %wd\n" "xp = %{ulong*}\n" - "rp1 = %{ulong*}\n" - "rp2 = %{ulong*}\n", - n, xp, n, rp1, 2 * n, rp2, 2 * n); + "Expected: %{ulong*}\n" + "Got: %{ulong*}\n", + n, xp, n, rp2, 2 * n, rp1, 2 * n); flint_free(xp); flint_free(rp1); @@ -61,3 +61,9 @@ TEST_FUNCTION_START(flint_mpn_sqr, state) TEST_FUNCTION_END(state); } + +#undef N_MIN +#undef N_MAX +#undef WANT_BIG +#undef N_MIN_STOR +#undef N_MAX_STOR diff --git a/src/mpn_extras/test/t-sqrhigh.c b/src/mpn_extras/test/t-sqrhigh.c index 6ff0db91b8..ac5ec759bd 100644 --- a/src/mpn_extras/test/t-sqrhigh.c +++ b/src/mpn_extras/test/t-sqrhigh.c @@ -12,13 +12,68 @@ #include "test_helpers.h" #include "mpn_extras.h" -/* TODO: Remove this preprocessor conditional */ -#if FLINT_HAVE_NATIVE_mpn_mulhigh_basecase +#define N_MIN 1 -#define N_MAX FLINT_MAX(FLINT_MPN_SQRHIGH_SQR_CUTOFF + 50, 2 * FLINT_MPN_SQRHIGH_MULDERS_CUTOFF) +#define WANT_STOR 1 + +#if WANT_STOR +# define MAX_ALLOC_SIZE (FLINT_MPN_SQRHIGH_SQR_CUTOFF + 50) +#else +# define N_MAX FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH +# define MAX_ALLOC_SIZE (2 * N_MAX) +#endif + +#define GET_N_FULL_SQR(state) (N_MIN + FLINT_MPN_SQRHIGH_SQR_CUTOFF + n_randint(state, 50 - N_MIN + 1)) +#define GET_N_LARGE(state) (N_MIN + n_randint(state, FLINT_MPN_SQRHIGH_SQR_CUTOFF - N_MIN + 1)) +#define GET_N_SMALL(state) (N_MIN + n_randint(state, 2 * FLINT_MPN_SQRHIGH_MULDERS_CUTOFF - N_MIN + 1)) #define rpcH (rpc + n - 1) +/* Defined in t-mulhigh.c and t-sqrhigh.c */ +#ifndef generic_mulhigh_basecase +# define generic_mulhigh_basecase generic_mulhigh_basecase +static mp_limb_t +generic_mulhigh_basecase(mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t low; + mp_limb_t a, b; + + if (n == 1) + { + umul_ppmm(rp[0], low, up[0], vp[0]); + } + else if (n == 2) + { + FLINT_MPN_MUL_2X2(rp[1], rp[0], low, b, up[1], up[0], vp[1], vp[0]); + } + else if (n == 3) + { + NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, 2); + NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, 3); + NN_DOTREV_S3_A3_1X1(b, a, rp[0], 0, b, a, up + 1, vp + 1, 2); + NN_ADDMUL_S2_A2_1X1(rp[2], rp[1], b, a, up[2], vp[2]); + } + else + { + mp_limb_t tp[2]; + slong ix; + + NN_DOTREV_S3_1X1_HIGH(b, a, up, vp, n - 1); + NN_DOTREV_S3_A3_1X1(b, a, low, 0, b, a, up, vp, n); + tp[0] = a; + tp[1] = b; + + umul_ppmm(rp[1], rp[0], up[n - 1], vp[1]); + for (ix = 2; ix < n; ix++) + rp[ix] = mpn_addmul_1(rp, up + n - ix, ix, vp[ix]); + + mpn_add(rp, rp, n, tp, 2); + } + + return low; +} +#endif + /* Defined in t-mulhigh.c and t-sqrhigh.c */ #ifndef lower_bound # define lower_bound lower_bound @@ -36,73 +91,105 @@ TEST_FUNCTION_START(flint_mpn_sqrhigh, state) { slong ix; int result; - mp_ptr rp, rpc, xp; - rp = flint_malloc(sizeof(mp_limb_t) * (N_MAX + 1)); - rpc = flint_malloc(2 * sizeof(mp_limb_t) * N_MAX); - xp = flint_malloc(sizeof(mp_limb_t) * N_MAX); + rpc = flint_malloc(2 * sizeof(mp_limb_t) * MAX_ALLOC_SIZE); for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++) { mp_limb_t borrow; mp_size_t n; + +#if WANT_STOR /* Trigger full multiplication in mulhigh */ if (n_randint(state, 1000) == 0) - n = 1 + FLINT_MPN_SQRHIGH_SQR_CUTOFF + n_randint(state, 50); + n = GET_N_FULL_SQR(state); else if (n_randint(state, 100) == 0) - n = 1 + n_randint(state, FLINT_MPN_SQRHIGH_SQR_CUTOFF); + n = GET_N_LARGE(state); else - n = 1 + n_randint(state, 2 * FLINT_MPN_SQRHIGH_MULDERS_CUTOFF); + n = GET_N_SMALL(state); +#else + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); +#endif + + xp = flint_malloc(sizeof(mp_limb_t) * n); mpn_random2(xp, n); - rp[0] = flint_mpn_sqrhigh(rp + 1, xp, n); - - /* Check upper bound */ - flint_mpn_sqr(rpc, xp, n); - - result = (mpn_cmp(rp, rpcH, n + 1) <= 0); - if (!result) - TEST_FUNCTION_FAIL( - "Bigger than upper bound!\n" - "ix = %wd\n" - "n = %wd\n" - "xp = %{ulong*}\n" - "yp = %{ulong*}\n" - "Upper bound: %{ulong*}\n" - "Got: %{ulong*}\n", - ix, n, xp, n, rpcH, n + 1, rp, n + 1); - - /* Check lower bound */ - borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n)); - if (borrow) - mpn_zero(rpcH, n + 1); - - result = (mpn_cmp(rp, rpcH, n + 1) >= 0); - if (!result) - TEST_FUNCTION_FAIL( - "Smaller than lower bound!\n" - "ix = %wd\n" - "n = %wd\n" - "xp = %{ulong*}\n" - "Lower bound: %{ulong*}\n" - "Got: %{ulong*}\n", - ix, n, xp, n, rpcH, n + 1, rp, n + 1); + if (n <= FLINT_MAX(FLINT_MPN_SQRHIGH_MULDERS_CUTOFF, FLINT_MPN_SQRHIGH_FUNC_TAB_WIDTH)) + { + /* Check that it is coherent with generic version */ + mp_limb_t ret0, ret1; + + rp = flint_malloc(sizeof(mp_limb_t) * n); + + ret0 = generic_mulhigh_basecase(rpc, xp, xp, n); + ret1 = flint_mpn_sqrhigh(rp, xp, n); + + result = (mpn_cmp(rp, rpc, n) == 0 && ret0 == ret1); + if (!result) + TEST_FUNCTION_FAIL( + "Basecase not coherent with generic version.\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "Expected ret: %{ulong}\n" + "Got ret: %{ulong}\n" + "Expected limbs: %{ulong*}\n" + "Got limbs: %{ulong*}\n", + ix, n, xp, n, ret0, ret1, rpc, n, rp, n); + } + else + { + /* Check that it is coherent with bounds */ + rp = flint_malloc(sizeof(mp_limb_t) * (n + 1)); + + rp[0] = flint_mpn_sqrhigh(rp + 1, xp, n); + flint_mpn_sqr(rpc, xp, n); + + result = (mpn_cmp(rp, rpcH, n + 1) <= 0); + if (!result) + TEST_FUNCTION_FAIL( + "Bigger than upper bound!\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "Upper bound: %{ulong*}\n" + "Got: %{ulong*}\n", + ix, n, xp, n, rpcH, n + 1, rp, n + 1); + + /* Check lower bound */ + borrow = mpn_sub_1(rpcH, rpcH, n + 1, lower_bound(n)); + if (borrow) + mpn_zero(rpcH, n + 1); + + result = (mpn_cmp(rp, rpcH, n + 1) >= 0); + if (!result) + TEST_FUNCTION_FAIL( + "Smaller than lower bound!\n" + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "Lower bound: %{ulong*}\n" + "Got: %{ulong*}\n", + ix, n, xp, n, rpcH, n + 1, rp, n + 1); + } + + flint_free(xp); + flint_free(rp); } - flint_free(rp); flint_free(rpc); - flint_free(xp); TEST_FUNCTION_END(state); } -# undef N_MAX -# undef rpcH -#else -TEST_FUNCTION_START(flint_mpn_sqrhigh, state) -{ - TEST_FUNCTION_END_SKIPPED(state); -} -#endif +#undef N_MIN +#undef N_MAX +#undef WANT_STOR +#undef rpcH +#undef MAX_ALLOC_SIZE + +#undef GET_N_FULL_SQR +#undef GET_N_LARGE +#undef GET_N_SMALL diff --git a/src/mpn_extras/test/t-sqrhigh_basecase.c b/src/mpn_extras/test/t-sqrhigh_basecase.c deleted file mode 100644 index 7993a94704..0000000000 --- a/src/mpn_extras/test/t-sqrhigh_basecase.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - Copyright (C) 2024 Albin Ahlbäck - - This file is part of FLINT. - - FLINT is free software: you can redistribute it and/or modify it under - the terms of the GNU Lesser General Public License (LGPL) as published - by the Free Software Foundation; either version 3 of the License, or - (at your option) any later version. See . -*/ - -#include "test_helpers.h" -#include "mpn_extras.h" - -/* TODO: Remove this preprocessor conditional */ -#if FLINT_HAVE_NATIVE_mpn_sqrhigh_basecase - -# define N_MIN 1 -# define N_MAX 64 - -TEST_FUNCTION_START(flint_mpn_sqrhigh_basecase, state) -{ - slong ix; - int result; - - for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++) - { - mp_limb_t rp1[N_MAX + 1]; - mp_limb_t rp2[N_MAX + 1]; - mp_limb_t xp[N_MAX]; - mp_size_t n; - - n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); - - mpn_random2(xp, n); - - if (FLINT_HAVE_MULHIGH_FUNC(n)) - rp1[0] = flint_mpn_mulhigh_func_tab[n](rp1 + 1, xp, xp); - else - rp1[0] = _flint_mpn_mulhigh_basecase(rp1 + 1, xp, xp, n); - - rp2[0] = flint_mpn_sqrhigh_basecase(rp2 + 1, xp, n); - - result = (mpn_cmp(rp1, rp2, n + 1) == 0); - if (!result) - TEST_FUNCTION_FAIL( - "Wrong result!\n" - "ix = %wd\n" - "n = %wd\n" - "xp = %{ulong*}\n" - "Expected: %{ulong*}\n" - "Got: %{ulong*}\n", - ix, n, xp, n, rp1, n + 1, rp2, n + 1); - } - - TEST_FUNCTION_END(state); -} -# undef N_MIN -# undef N_MAX -#else -TEST_FUNCTION_START(flint_mpn_sqrhigh_basecase, state) -{ - TEST_FUNCTION_END_SKIPPED(state); -} -#endif diff --git a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm index 86f4bb0088..79215a840c 100644 --- a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm +++ b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_even.asm @@ -409,12 +409,10 @@ undefine(`ld3') pop %rbx ret - +EPILOGUE() JUMPTABSECT ALIGN(8) L(etab):JMPENT( L(ep0), L(etab)) JMPENT( L(ep1), L(etab)) JMPENT( L(ep2), L(etab)) JMPENT( L(ep3), L(etab)) - TEXT -EPILOGUE() diff --git a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm index 3be67692f7..58767eccf1 100644 --- a/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm +++ b/src/mpn_extras/x86_64/broadwell/sqrhigh_basecase_odd.asm @@ -397,12 +397,10 @@ undefine(`ld3') pop %rbx ret - +EPILOGUE() JUMPTABSECT ALIGN(8) L(dtab):JMPENT( L(dp0), L(dtab)) JMPENT( L(dp1), L(dtab)) JMPENT( L(dp2), L(dtab)) JMPENT( L(dp3), L(dtab)) - TEXT -EPILOGUE()