diff --git a/.gitignore b/.gitignore index f7fb28e8f7..ce3195baac 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,5 @@ flint.pc autom4te.cache/ config.m4 src/flint-mparam.h +.gdb_history +vgcore.* diff --git a/Makefile.in b/Makefile.in index b48e007b66..5a09cd6046 100644 --- a/Makefile.in +++ b/Makefile.in @@ -56,6 +56,7 @@ RM_F:=rm -f RM_RF:=rm -rf CP:=cp CP_A:=cp -a +GDB:=gdb STATIC:=@STATIC@ SHARED:=@SHARED@ @@ -498,6 +499,16 @@ else -include $(BUILD_DIR)/test/*.d -include $(BUILD_DIR)/*/test/*.d endif +else ifeq ($(MAKECMDGOALS), debug) +ifdef MOD +$(warning Dependency tracking only set to cover the test executables of $(MOD).) +-include $(foreach dir, $(MOD), $(BUILD_DIR)/$(dir)/test/*.d) +else +-include $(BUILD_DIR)/*/*.o.d +-include $(BUILD_DIR)/*/*.lo.d +-include $(BUILD_DIR)/test/*.d +-include $(BUILD_DIR)/*/test/*.d +endif else ifeq ($(MAKECMDGOALS), tune) -include $(BUILD_DIR)/*/*.o.d -include $(BUILD_DIR)/*/*.lo.d @@ -798,6 +809,25 @@ check: library $(TESTS:%=%_TEST_RUN) @echo 'All tests passed.' 
endif +################################################################################ +# debugging +################################################################################ + +%_TEST_DBG_RUN_ARGS: % + @$(GDB) --args $< $(ARGS) + +ifdef MOD +ifdef ARGS +DEBUG:=1 +debug: library $(patsubst %,%_TEST_DBG_RUN_ARGS, $($(sort $(MOD))_TESTS)) +endif +endif + +ifneq ($(DEBUG),1) +debug: + $(error Can only run debugger with one module and one argument at a time) +endif + ################################################################################ # tuning ################################################################################ @@ -952,11 +982,11 @@ dist: dev/make_dist.sh $(FLINT_VERSION) ################################################################################ -# debugging +# makefile debugging ################################################################################ print-%: @echo "$*=$($*)" -.PHONY: all library shared static examples checkexamples profile tests check tune valgrind clean distclean install uninstall dist %_TEST_RUN %_VALGRIND_RUN print-% coverage coverage_html -.SILENT: $(mpn_extras_S_SOURCES) +.PHONY: all library shared static examples checkexamples profile tests check tune valgrind clean distclean install uninstall dist %_TEST_RUN %_TEST_RUN_% %_TEST_DBG_RUN_ARGS %_VALGRIND_RUN print-% coverage coverage_html debug +.PRECIOUS: $(mpn_extras_PIC_S_SOURCES) $(mpn_extras_S_SOURCES) diff --git a/src/mpn_extras.h b/src/mpn_extras.h index 1c3a76f000..1d8420ccb1 100644 --- a/src/mpn_extras.h +++ b/src/mpn_extras.h @@ -370,6 +370,36 @@ flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n) flint_mpn_mul((_z), (_y), (_yn), (_x), (_xn)); \ } +/* Low multiplication ********************************************************/ + +#define FLINT_HAVE_MULLOW_FUNC(n) ((n) <= FLINT_MPN_MULLOW_FUNC_TAB_WIDTH) + +FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[]; + +mp_limb_t flint_mpn_mullow_basecase(mp_ptr, mp_srcptr,
mp_srcptr, mp_size_t); + +#if FLINT_HAVE_ASSEMBLY_x86_64_adx +# define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 8 +# define FLINT_HAVE_NATIVE_mpn_mullow_basecase 1 +#else +# define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 0 +#endif + +/* TODO: Fix higher stuff */ +MPN_EXTRAS_INLINE +mp_limb_t flint_mpn_mullow_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + FLINT_ASSERT(n >= 1); + + if (FLINT_HAVE_MULLOW_FUNC(n)) + { + FLINT_ASSERT(rp != xp); + return flint_mpn_mullow_func_tab[n](rp, xp, yp); + } + else + return flint_mpn_mullow_basecase(rp, xp, yp, n); +} + /* High multiplication *******************************************************/ #define FLINT_HAVE_MULHIGH_FUNC(n) ((n) <= FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH) diff --git a/src/mpn_extras/mullow.c b/src/mpn_extras/mullow.c new file mode 100644 index 0000000000..266dfd0a44 --- /dev/null +++ b/src/mpn_extras/mullow.c @@ -0,0 +1,57 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "mpn_extras.h" + +#if FLINT_HAVE_ASSEMBLY_x86_64_adx +mp_limb_t flint_mpn_mullow_1(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mullow_2(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mullow_3(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mullow_4(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mullow_5(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mullow_6(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mullow_7(mp_ptr, mp_srcptr, mp_srcptr); +mp_limb_t flint_mpn_mullow_8(mp_ptr, mp_srcptr, mp_srcptr); + +const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[] = +{ + NULL, + flint_mpn_mullow_1, + flint_mpn_mullow_2, + flint_mpn_mullow_3, + flint_mpn_mullow_4, + flint_mpn_mullow_5, + flint_mpn_mullow_6, + flint_mpn_mullow_7, + flint_mpn_mullow_8 +}; +#else +const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[] = { NULL }; +#endif + +#if !FLINT_HAVE_NATIVE_mpn_mullow_basecase +mp_limb_t +flint_mpn_mullow_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + mp_limb_t ret; + mp_size_t ix; + + ret = mpn_mul_1(rp, xp, n, yp[0]); + + for (ix = 1; ix < n; ix++) + { + ret += mpn_addmul_1(rp + ix, xp, n - ix, yp[ix]); + ret += xp[n - ix] * yp[ix]; + } + + return ret; +} +#endif diff --git a/src/mpn_extras/profile/p-mullow.c b/src/mpn_extras/profile/p-mullow.c new file mode 100644 index 0000000000..6a82befd61 --- /dev/null +++ b/src/mpn_extras/profile/p-mullow.c @@ -0,0 +1,74 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . 
+*/ + +#include "mpn_extras.h" +#include "profiler.h" + +#define mpn_mullo_basecase __gmpn_mullo_basecase +void mpn_mullo_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t); + +#define N_MIN 1 +#define N_MAX 30 + +#if N_MAX < 9 +# define P 1 +#elif N_MAX < 99 +# define P 2 +#elif N_MAX < 999 +# define P 3 +#elif N_MAX < 9999 +# define P 4 +#else +# define P 5 +#endif + +#define _STR(x) #x +#define STR(x) _STR(x) + +int +main(void) +{ + mp_limb_t rp[2 * N_MAX]; + mp_limb_t xp[N_MAX]; + mp_limb_t yp[N_MAX]; + mp_size_t n; + + flint_printf("%.*s mullo_basecase / FLINT || mul / mullow\n", P, " "); + for (n = N_MIN; n <= N_MAX; n++) + { + double t1, t2, t3, FLINT_SET_BUT_UNUSED(__); + flint_printf("n = %" STR(P) "wd:", n); + + mpn_random2(xp, n); + mpn_random2(yp, n); + + TIMEIT_START + mpn_mullo_basecase(rp, xp, yp, n); + TIMEIT_STOP_VALUES(__, t1) + + TIMEIT_START + flint_mpn_mul_n(rp, xp, yp, n); + TIMEIT_STOP_VALUES(__, t2) + + TIMEIT_START + flint_mpn_mullow_n(rp, xp, yp, n); + TIMEIT_STOP_VALUES(__, t3) + + flint_printf(" %7.2fx || %7.2f\n", t1 / t3, t2 / t3); + } + + flint_cleanup_master(); + + return 0; +} + +#undef N_MIN +#undef N_MAX diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c index 0d8ab7c6ac..e28b927bd1 100644 --- a/src/mpn_extras/test/main.c +++ b/src/mpn_extras/test/main.c @@ -24,6 +24,7 @@ #include "t-mul.c" #include "t-mul_n.c" #include "t-mul_toom22.c" +#include "t-mullow_n.c" #include "t-mulhigh.c" #include "t-mulhigh_normalised.c" #include "t-mulmod_2expp1.c" @@ -48,6 +49,7 @@ test_struct tests[] = TEST_FUNCTION(flint_mpn_mul), TEST_FUNCTION(flint_mpn_mul_n), TEST_FUNCTION(flint_mpn_mul_toom22), + TEST_FUNCTION(flint_mpn_mullow_n), TEST_FUNCTION(flint_mpn_mulhigh), TEST_FUNCTION(flint_mpn_mulhigh_normalised), TEST_FUNCTION(flint_mpn_mulmod_2expp1), diff --git a/src/mpn_extras/test/t-mullow_n.c b/src/mpn_extras/test/t-mullow_n.c new file mode 100644 index 0000000000..388fd330fb --- /dev/null +++ 
b/src/mpn_extras/test/t-mullow_n.c @@ -0,0 +1,66 @@ +/* + Copyright (C) 2024 Albin Ahlbäck + + This file is part of FLINT. + + FLINT is free software: you can redistribute it and/or modify it under + the terms of the GNU Lesser General Public License (LGPL) as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. See . +*/ + +#include "test_helpers.h" +#include "mpn_extras.h" + +#define N_MIN 1 +#define N_MAX 30 + +TEST_FUNCTION_START(flint_mpn_mullow_n, state) +{ + slong ix; + int result; + + mp_ptr rp, rpf, xp, yp; + + for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++) + { + mp_limb_t ret; + mp_size_t n; + + n = N_MIN + n_randint(state, N_MAX - N_MIN + 1); + + rp = flint_malloc(sizeof(mp_limb_t) * n); + rpf = flint_malloc(2 * sizeof(mp_limb_t) * n); + xp = flint_malloc(sizeof(mp_limb_t) * n); + yp = flint_malloc(sizeof(mp_limb_t) * n); + + mpn_random2(xp, n); + mpn_random2(yp, n); + + ret = flint_mpn_mullow_n(rp, xp, yp, n); + flint_mpn_mul_n(rpf, xp, yp, n); + + result = (mpn_cmp(rp, rpf, n) == 0 && ret == rpf[n]); + if (!result) + TEST_FUNCTION_FAIL( + "ix = %wd\n" + "n = %wd\n" + "xp = %{ulong*}\n" + "yp = %{ulong*}\n" + "Exp ret: %{ulong}\n" + "Got ret: %{ulong}\n" + "Expected: %{ulong*}\n" + "Got: %{ulong*}\n", + ix, n, xp, n, yp, n, rpf[n], ret, rpf, n, rp, n); + + flint_free(rp); + flint_free(rpf); + flint_free(xp); + flint_free(yp); + } + + TEST_FUNCTION_END(state); +} + +#undef N_MIN +#undef N_MAX diff --git a/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm b/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm new file mode 100644 index 0000000000..3fa98bee51 --- /dev/null +++ b/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm @@ -0,0 +1,484 @@ +dnl +dnl Copyright 2017 Free Software Foundation, Inc. +dnl Contributed to the GNU project by Torbjorn Granlund. +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT.
+dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`config.m4') + +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp_param', `%rdx') +define(`n', `%rcx') + +define(`bp', `%r11') + +define(`jmpreg',`%r15') +define(`nn', `%rbp') +define(`mm', `%rbx') + +define(`s0', `%r8') +define(`s1', `%r9') +define(`s2', `%r10') +define(`s3', `%r12') +define(`s4', `%r13') +define(`s5', `%r14') + +define(`sx', `%rax') + +dnl Scheme: +dnl 0 1 2 3 4 5 6 7 8 +dnl 0 x x x x x x x x x +dnl 1 x x x x x x x x l +dnl 2 x x x x x x x l +dnl 3 x x x x x x l +dnl 4 x x x x x l +dnl 5 x x x x l +dnl 6 x x x l +dnl 7 x x l +dnl 8 x l + +dnl NOTE: Requires n > 8. 
+ + TEXT + ALIGN(32) +PROLOGUE(flint_mpn_mullow_basecase) + push mm + push nn + push s3 + push s4 + push s5 + push jmpreg + + lea -2(n), R32(nn) + lea 1*8(bp_param), bp C Prepare bp for addmul_1 + mov 0*8(bp_param), %rdx C Load rdx for mul_1 + + mov R32(n), R32(sx) + shr $3, R32(n) + and $7, R32(sx) C clear OF, CF as side-effect + lea L(mtab)(%rip), s0 +ifdef(`PIC', +` movslq (s0,sx,4), sx + lea (sx,s0), s0 + jmp *s0 +',` + jmp *(s0,sx,8) +') + +L(mf0): mulx 0*8(ap), s0, s2 + lea 7*8(ap), ap + lea -1*8(rp), rp + lea L(f0)(%rip), jmpreg + jmp L(mb0) + +L(mf3): mulx 0*8(ap), s1, sx + lea 2*8(ap), ap + lea 2*8(rp), rp + inc R32(n) + lea L(f3)(%rip), jmpreg + jmp L(mb3) + +L(mf4): mulx 0*8(ap), s0, s2 + lea 3*8(ap), ap + lea 3*8(rp), rp + inc R32(n) + lea L(f4)(%rip), jmpreg + jmp L(mb4) + +L(mf5): mulx 0*8(ap), s1, sx + lea 4*8(ap), ap + lea 4*8(rp), rp + inc R32(n) + lea L(f5)(%rip), jmpreg + jmp L(mb5) + +L(mf6): mulx 0*8(ap), s0, s2 + lea 5*8(ap), ap + lea 5*8(rp), rp + inc R32(n) + lea L(f6)(%rip), jmpreg + jmp L(mb6) + +L(mf7): mulx 0*8(ap), s1, sx + lea 6*8(ap), ap + lea 6*8(rp), rp + inc R32(n) + lea L(f7)(%rip), jmpreg + jmp L(mb7) + +L(mf1): mulx 0*8(ap), s1, sx + lea L(f1)(%rip), jmpreg + jmp L(mb1) + +L(mf2): mulx 0*8(ap), s0, s2 + lea 1*8(ap), ap + lea 1*8(rp), rp + lea L(f2)(%rip), jmpreg + mulx 0*8(ap), s1, sx + + ALIGN(32) +L(mtop):mov s0, -1*8(rp) + adc s2, s1 +L(mb1): mulx 1*8(ap), s0, s2 + adc sx, s0 + lea 8*8(ap), ap + mov s1, 0*8(rp) +L(mb0): mov s0, 1*8(rp) + mulx -6*8(ap), s1, sx + lea 8*8(rp), rp + adc s2, s1 +L(mb7): mulx -5*8(ap), s0, s2 + mov s1, -6*8(rp) + adc sx, s0 +L(mb6): mov s0, -5*8(rp) + mulx -4*8(ap), s1, sx + adc s2, s1 +L(mb5): mulx -3*8(ap), s0, s2 + mov s1, -4*8(rp) + adc sx, s0 +L(mb4): mulx -2*8(ap), s1, sx + mov s0, -3*8(rp) + adc s2, s1 +L(mb3): mulx -1*8(ap), s0, s2 + adc sx, s0 + mov s1, -2*8(rp) + dec R32(n) + mulx 0*8(ap), s1, sx + jnz L(mtop) + + lea 1*8(,nn,8), R32(mm) + mov s0, -1*8(rp) + adc s2, s1 + mov 0*8(bp), %rdx 
+ mov s1, 0*8(rp) + adc n, sx C n = 0 + shr $3, R32(nn) + neg mm + or R32(nn), R32(n) C Reset n, clear OF and CF + lea 1*8(rp,mm), rp C Reset rp + lea (ap,mm), ap C Reset ap + jmp *jmpreg + + nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop; +L(f7): mulx 0*8(ap), s0, s2 + lea 5*8(ap), ap + lea -3*8(rp), rp + lea L(f6)(%rip), jmpreg + jmp L(b7) + +L(f6): mulx 0*8(ap), s1, s3 + lea 4*8(ap), ap + lea -4*8(rp), rp + lea L(f5)(%rip), jmpreg + jmp L(b6) + +L(end): adox 0*8(rp), s1 + mulx 1*8(ap), s0, s2 C Only s0 is used + lea 1*8(mm), mm + adox s3, sx + mov s1, 0*8(rp) + lea 1*8(bp), bp + adc s0, sx + lea (ap,mm), ap C Reset ap + mov 0*8(bp), %rdx + or R32(nn), R32(n) C Reset count, clear CF and OF + lea 1*8(rp,mm), rp C Reset rp + jmp *jmpreg + +L(f0): mulx 0*8(ap), s1, s3 + lea -2*8(ap), ap + lea -2*8(rp), rp + lea L(f7)(%rip), jmpreg + jmp L(b0) + +L(f3): mulx 0*8(ap), s0, s2 + lea 1*8(ap), ap + lea 1*8(rp), rp + mulx 0*8(ap), s1, s3 + lea L(f2)(%rip), jmpreg + + ALIGN(32) +L(top): adox -1*8(rp), s0 + adcx s2, s1 + mov s0, -1*8(rp) + jrcxz L(end) +L(b2): mulx 1*8(ap), s0, s2 + adox 0*8(rp), s1 + lea -1(n), R32(n) + mov s1, 0*8(rp) + adcx s3, s0 +L(b1): mulx 2*8(ap), s1, s3 + adcx s2, s1 + adox 1*8(rp), s0 + mov s0, 1*8(rp) +L(b0): mulx 3*8(ap), s0, s2 + lea 8*8(ap), ap + adcx s3, s0 + adox 2*8(rp), s1 + mov s1, 2*8(rp) +L(b7): mulx -4*8(ap), s1, s3 + adox 3*8(rp), s0 + adcx s2, s1 + mov s0, 3*8(rp) +L(b6): mulx -3*8(ap), s0, s2 + adcx s3, s0 + adox 4*8(rp), s1 + mov s1, 4*8(rp) +L(b5): mulx -2*8(ap), s1, s3 + adox 5*8(rp), s0 + adcx s2, s1 + mov s0, 5*8(rp) +L(b4): adox 6*8(rp), s1 + mulx -1*8(ap), s0, s2 + mov s1, 6*8(rp) + lea 8*8(rp), rp + adcx s3, s0 + mulx 0*8(ap), s1, s3 + jmp L(top) + +L(f5): mulx 0*8(ap), s0, s2 + lea 3*8(ap), ap + lea -5*8(rp), rp + lea L(f4)(%rip), jmpreg + jmp L(b5) + +L(f4): mulx 0*8(ap), s1, s3 + lea 2*8(ap), ap + lea -6*8(rp), rp + lea L(f3)(%rip), jmpreg + jmp L(b4) + +L(f2): mulx 0*8(ap), s1, s3 + lea -1(nn), 
R32(nn) + lea L(f1)(%rip), jmpreg + jmp L(b2) + +L(f1): mulx 0*8(ap), s0, s2 + jrcxz L(cor) + lea -1*8(ap), ap + lea -1*8(rp), rp + lea L(f0)(%rip), jmpreg + jmp L(b1) + +define(`t0', `s0') +define(`t1', `s2') +define(`t2', `s1') +define(`t3', `s3') +define(`t4', `s4') +define(`t5', `s5') +define(`t6', `jmpreg') +define(`t7', `mm') +define(`t8', `nn') +define(`t9', `n') +define(`tx', `sx') +L(cor): mulx 1*8(ap), t2, t3 + mulx 2*8(ap), t4, t5 + adcx 0*8(rp), t0 + adox t1, t2 + mulx 3*8(ap), t6, t7 + adcx 1*8(rp), t2 + adox t3, t4 + mulx 4*8(ap), t8, t9 + adcx 2*8(rp), t4 + adox t5, t6 + mulx 5*8(ap), t1, t3 + adcx 3*8(rp), t6 + adox t7, t8 + mulx 6*8(ap), t5, t7 + mov t0, 0*8(rp) + adcx 4*8(rp), t8 + adox t9, t1 + mulx 7*8(ap), t0, t9 + adcx 5*8(rp), t1 + adox t3, t5 + mulx 8*8(ap), t3, %rdx C %rdx unused + adcx 6*8(rp), t5 + adox t7, t0 + adcx 7*8(rp), t0 + adox t9, tx + C 2, 4, 6, 8, 1, 5, 0, x + + mov 1*8(bp), %rdx + mulx 0*8(ap), t7, t9 + adc t3, tx + test %al, %al C Reset OF and CF + adcx t7, t2 + adox t9, t4 + mov t2, 1*8(rp) + mulx 1*8(ap), t7, t9 + mulx 2*8(ap), t2, t3 + adcx t7, t4 + adox t9, t6 + mulx 3*8(ap), t7, t9 + adcx t2, t6 + adox t3, t8 + mulx 4*8(ap), t2, t3 + adcx t7, t8 + adox t9, t1 + mulx 5*8(ap), t7, t9 + adcx t2, t1 + adox t3, t5 + mulx 6*8(ap), t2, t3 + adcx t7, t5 + adox t9, t0 + mulx 7*8(ap), t7, t9 C t9 unused + adcx t2, t0 + adox t3, tx + C 4, 6, 8, 1, 5, 0, x + + mov 2*8(bp), %rdx + mulx 0*8(ap), t2, t3 + adc t7, tx + test %al, %al + mulx 1*8(ap), t7, t9 + adcx t2, t4 + adox t3, t6 + mov t4, 2*8(rp) + mulx 2*8(ap), t2, t3 + adcx t7, t6 + adox t9, t8 + mulx 3*8(ap), t7, t9 + adcx t2, t8 + adox t3, t1 + mulx 4*8(ap), t2, t3 + adcx t7, t1 + adox t9, t5 + mulx 5*8(ap), t7, t9 + adcx t2, t5 + adox t3, t0 + mulx 6*8(ap), t2, t3 C t3 unused + adcx t7, t0 + adox t9, tx + C 6, 8, 1, 5, 0, x + + mov 3*8(bp), %rdx + mulx 0*8(ap), t4, t7 + adc t2, tx + test %al, %al + mulx 1*8(ap), t2, t3 + adcx t4, t6 + adox t7, t8 + mov t6, 3*8(rp) + mulx 
2*8(ap), t4, t7 + mulx 3*8(ap), t6, t9 + adcx t2, t8 + adox t3, t1 + mulx 4*8(ap), t2, t3 + adcx t4, t1 + adox t7, t5 + mulx 5*8(ap), t4, t7 C t7 unused + adcx t6, t5 + adox t9, t0 + adcx t2, t0 + adox t3, tx + C 8, 1, 5, 0, x + + mov 4*8(bp), %rdx + mulx 0*8(ap), t2, t3 + adc t4, tx + test %al, %al + mulx 1*8(ap), t4, t6 + mulx 2*8(ap), t7, t9 + adcx t2, t8 + adox t3, t1 + mulx 3*8(ap), t2, t3 + adcx t4, t1 + adox t6, t5 + mulx 4*8(ap), t4, t6 C t6 unused + adcx t7, t5 + adox t9, t0 + mov t8, 4*8(rp) + adox t3, tx + adc t2, t0 + C 1, 5, 0, x + + mov 5*8(bp), %rdx + mulx 0*8(ap), t2, t3 + adc t4, tx + mulx 1*8(ap), t4, t6 + mulx 2*8(ap), t7, t8 + imul 3*8(ap), %rdx + add t2, t1 + adc t3, t5 + mov t1, 5*8(rp) + adc t6, t0 + adc t8, tx + add t4, t5 + adc t7, t0 + adc %rdx, tx + C 5, 0, x + + mov 6*8(bp), %rdx + mulx 0*8(ap), t1, t2 + mulx 1*8(ap), t3, t4 + imul 2*8(ap), %rdx + add t1, t5 + adc t2, t0 + mov t5, 6*8(rp) + adc t4, tx + add t3, t0 + adc %rdx, tx + C 0, x + + mov 7*8(bp), %rdx + mulx 0*8(ap), t1, t2 + imul 1*8(ap), %rdx + + pop jmpreg + pop s5 + pop s4 + pop s3 + pop nn + pop mm + + add t1, t0 + adc t2, tx + mov t0, 7*8(rp) + add %rdx, tx + + ret +EPILOGUE() + JUMPTABSECT + ALIGN(8) +L(mtab):JMPENT( L(mf0), L(mtab)) + JMPENT( L(mf1), L(mtab)) + JMPENT( L(mf2), L(mtab)) + JMPENT( L(mf3), L(mtab)) + JMPENT( L(mf4), L(mtab)) + JMPENT( L(mf5), L(mtab)) + JMPENT( L(mf6), L(mtab)) + JMPENT( L(mf7), L(mtab)) diff --git a/src/mpn_extras/x86_64/broadwell/mullow_hard.asm b/src/mpn_extras/x86_64/broadwell/mullow_hard.asm new file mode 100644 index 0000000000..24d0e92693 --- /dev/null +++ b/src/mpn_extras/x86_64/broadwell/mullow_hard.asm @@ -0,0 +1,744 @@ +dnl +dnl Copyright (C) 2024 Albin Ahlbäck +dnl +dnl This file is part of FLINT. 
+dnl +dnl FLINT is free software: you can redistribute it and/or modify it under +dnl the terms of the GNU Lesser General Public License (LGPL) as published +dnl by the Free Software Foundation; either version 3 of the License, or +dnl (at your option) any later version. See . +dnl + +include(`config.m4') + +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`bp_param', `%rdx') + +define(`bp', `%r8') + +define(`s0', `%rcx') +define(`s1', `%r9') +define(`s2', `%r10') +define(`s3', `%r11') +define(`s4', `%rbx') +define(`s5', `%rbp') +define(`s6', `%r12') +define(`s7', `%r13') +define(`s8', `%r14') +define(`s9', `%r15') + +define(`sx', `%rax') + + TEXT + + ALIGN(16) +PROLOGUE(flint_mpn_mullow_1) + mov 0*8(bp_param), %rdx + mulx 0*8(ap), s0, sx C a0 b0 + mov s0, 0*8(rp) + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_mullow_2) + mov 1*8(bp_param), sx + mov 0*8(bp_param), %rdx + + mulx 0*8(ap), s0, s1 C a0 b0 + mulx 1*8(ap), s2, s3 C a1 b0 + mov sx, %rdx + mulx 0*8(ap), bp_param, bp C a0 b1 + imul 1*8(ap), sx C L(a1 b1) + C s0, (s1, s2, bp_param), (s3, bp, sx) + + mov s0, 0*8(rp) + + add s2, s1 + adc s3, bp + C (s1, bp_param), (bp, sx) + + add bp_param, s1 + adc bp, sx + C s1, sx + + mov s1, 1*8(rp) + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_mullow_3) + push s4 + push s5 + + mov bp_param, bp + + mov 0*8(bp_param), %rdx + mulx 0*8(ap), s0, s1 C a0 b0 + mulx 1*8(ap), s2, s3 C a1 b0 + mulx 2*8(ap), s4, sx C a2 b0 + C 0, (1, 2), (3, 4), x + + mov s0, 0*8(rp) + add s2, s1 + + mov 1*8(bp), %rdx + mulx 0*8(ap), s0, s2 C a0 b1 + adc s4, s3 + adc $0, sx + mulx 1*8(ap), s4, s5 C a1 b1 + imul 2*8(ap), %rdx C L(a2 b1) + C 0, (2, 4), (5, rdx) + + add s0, s1 + adc s2, s3 + mov s1, 1*8(rp) + adc %rdx, sx + + mov 2*8(bp), %rdx + mulx 0*8(ap), s0, s2 C a0 b2 + imul 1*8(ap), %rdx C L(a1 b2) + C 0, (2, rdx) + + add s4, s3 + adc s5, sx + + pop s5 + pop s4 + + add s2, sx + add s0, s3 + adc %rdx, sx + mov s3, 2*8(rp) + + ret +EPILOGUE() + + ALIGN(16) 
+PROLOGUE(flint_mpn_mullow_4) + mov bp_param, bp + + push s4 + push s5 + push s6 + push s7 + push s8 + + mov 0*8(bp_param), %rdx + mulx 0*8(ap), s0, s1 C a0 b0 + mulx 1*8(ap), s2, s3 C a1 b0 + mulx 2*8(ap), s4, s5 C a2 b0 + mulx 3*8(ap), s6, sx C a3 b0 + C 0, (1, 2), (3, 4), (5, 6), x + + mov s0, 0*8(rp) + add s2, s1 + adc s4, s3 + adc s6, s5 + adc $0, sx + C 1, 3, 5, x + + mov 1*8(bp), %rdx + mulx 0*8(ap), s0, s2 C a0 b1 + mulx 1*8(ap), s4, s6 C a1 b1 + mulx 2*8(ap), s7, s8 C a2 b1 + imul 3*8(ap), %rdx C L(a3 b1) + C 0, (2, 4), (6, 7), (8, rdx) + + add s0, s1 + adc s2, s3 + adc s6, s5 + adc s8, sx + mov s1, 1*8(rp) + add s4, s3 + adc s7, s5 + adc %rdx, sx + C 3, 5, x + + mov 2*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b2 + mulx 1*8(ap), s2, s4 C a1 b2 + imul 2*8(ap), %rdx C L(a2 b2) + C 0, (1, 2), (4, rdx) + + add s0, s3 + adc s1, s5 + mov s3, 2*8(rp) + adc s4, sx + add s5, s2 + adc %rdx, sx + C 2, x + + mov 3*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b3 + imul 1*8(ap), %rdx C L(a1 b3) + C 0, (1, rdx) + + pop s8 + pop s7 + pop s6 + pop s5 + pop s4 + + add s0, s2 + adc s1, sx + mov s2, 3*8(rp) + add %rdx, sx + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_mullow_5) + mov bp_param, bp + + push s4 + push s5 + push s6 + push s7 + push s8 + push s9 + + mov 0*8(bp_param), %rdx + mulx 0*8(ap), s0, s1 C a0 b0 + mulx 1*8(ap), s2, s3 C a1 b0 + mulx 2*8(ap), s4, s5 C a2 b0 + mulx 3*8(ap), s6, s7 C a3 b0 + mulx 4*8(ap), s8, sx C a4 b0 + C 0, (1, 2), (3, 4), (5, 6), (7, 8), x + + mov s0, 0*8(rp) + add s2, s1 + adc s4, s3 + adc s6, s5 + adc s8, s7 + adc $0, sx + C 1, 3, 5, 7, x + + mov 1*8(bp), %rdx + mulx 0*8(ap), s0, s2 C a0 b1 + mulx 1*8(ap), s4, s6 C a1 b1 + mulx 2*8(ap), s8, s9 C a2 b1 + add s0, s1 + adc s2, s3 + mov s1, 1*8(rp) + mulx 3*8(ap), s0, s2 C a3 b1 + mulx 4*8(ap), %rdx, s1 C a4 b1, but we only use %rdx + C -, (-, 4), (6, 8), (9, 0), (2, %rdx) + + adc s6, s5 + adc s9, s7 + adc s2, sx + add s4, s3 + adc s8, s5 + adc s0, s7 + adc %rdx, sx + C 3, 5, 7, x + + mov 
2*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b2 + mulx 1*8(ap), s2, s4 C a1 b2 + mulx 2*8(ap), s6, s8 C a2 b2 + imul 3*8(ap), %rdx C L(a3 b2) + C 0, (1, 2), (4, 6), (8, rdx) + + add s0, s3 + adc s1, s5 + adc s4, s7 + adc s8, sx + add s2, s5 + adc s6, s7 + adc %rdx, sx + mov s3, 2*8(rp) + C 5, 7, x + + mov 3*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b3 + mulx 1*8(ap), s2, s3 C a1 b3 + imul 2*8(ap), %rdx C L(a2 b3) + C 0, (1, 2), (3, rdx) + + add s0, s5 + adc s1, s7 + mov s5, 3*8(rp) + adc s3, sx + add s7, s2 + adc %rdx, sx + C 2, x + + mov 4*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b4 + imul 1*8(ap), %rdx C L(a1 b4) + C 0, (1, rdx) + + pop s9 + pop s8 + pop s7 + pop s6 + pop s5 + pop s4 + + add s0, s2 + adc s1, sx + mov s2, 4*8(rp) + add %rdx, sx + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_mullow_6) + mov bp_param, bp + + push s4 + push s5 + push s6 + push s7 + push s8 + push s9 + + mov 0*8(bp_param), %rdx + mulx 0*8(ap), s0, s1 C a0 b0 + mulx 1*8(ap), s2, s3 C a1 b0 + mulx 2*8(ap), s4, s5 C a2 b0 + mulx 3*8(ap), s6, s7 C a3 b0 + mulx 4*8(ap), s8, s9 C a4 b0 + mulx 5*8(ap), %rdx, sx C a5 b0 + C 0, (1, 2), (3, 4), (5, 6), (7, 8), (9, rdx), x + + add s2, s1 + adc s4, s3 + adc s6, s5 + mov s0, 0*8(rp) + adc s8, s7 + adc %rdx, s9 + adc $0, sx + C 1, 3, 5, 7, 9, x + + test %al, %al + mov 1*8(bp), %rdx + mulx 0*8(ap), s2, s4 C a0 b1 + mulx 1*8(ap), s6, s8 C a1 b1 + adcx s2, s1 + adox s4, s3 + mov s1, 1*8(rp) + mulx 2*8(ap), s2, s4 C a2 b1 + adcx s6, s3 + adox s8, s5 + mulx 3*8(ap), s6, s8 C a3 b1 + adcx s2, s5 + adox s4, s7 + mulx 4*8(ap), s2, s4 C a4 b1 + adcx s6, s7 + adox s8, s9 + mulx 5*8(ap), %rdx, s6 C a5 b1, but we only use %rdx + C -, (-, -), (-, -), (-, -), (-, 2), (4, %rdx) + + adcx s2, s9 + adox s4, sx + adc %rdx, sx + C 3, 5, 7, 9, x + + mov 2*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b2 + mulx 1*8(ap), s2, s4 C a1 b2 + mulx 2*8(ap), s6, s8 C a2 b2 + add s0, s3 + adc s1, s5 + mov s3, 2*8(rp) + mulx 3*8(ap), s0, s1 C a3 b2 + mulx 4*8(ap), %rdx, s3 C a4 b2, 
but we only use %rdx + C -, (-, 2), (4, 6), (8, 0), (1, rdx) + + adc s4, s7 + adc s8, s9 + adc s1, sx + add s2, s5 + adc s6, s7 + adc s0, s9 + adc %rdx, sx + C 5, 7, 9, x + + mov 3*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b3 + mulx 1*8(ap), s2, s3 C a1 b3 + mulx 2*8(ap), s4, s6 C a2 b3 + imul 3*8(ap), %rdx C L(a3 b3) + C 0, (1, 2), (3, 4), (6, rdx) + + add s0, s5 + adc s1, s7 + adc s3, s9 + mov s5, 3*8(rp) + adc s6, sx + add s2, s7 + adc s4, s9 + adc %rdx, sx + C 7, 9, x + + mov 4*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b4 + mulx 1*8(ap), s2, s3 C a1 b4 + imul 2*8(ap), %rdx C L(a2 b4) + C 0, (1, 2), (3, rdx) + + add s0, s7 + adc s1, s9 + adc s3, sx + mov s7, 4*8(rp) + add s9, s2 + adc %rdx, sx + C 2, x + + mov 5*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b5 + imul 1*8(ap), %rdx C L(a1 b5) + C 0, (1, rdx) + + pop s9 + pop s8 + pop s7 + pop s6 + pop s5 + pop s4 + + add s0, s2 + adc s1, sx + mov s2, 5*8(rp) + add %rdx, sx + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_mullow_7) + mov bp_param, bp + + push s4 + push s5 + push s6 + push s7 + push s8 + push s9 + + mov 0*8(bp_param), %rdx + mulx 0*8(ap), s0, s1 C a0 b0 + mulx 1*8(ap), s2, s3 C a1 b0 + mulx 2*8(ap), s4, s5 C a2 b0 + mulx 3*8(ap), s6, s7 C a3 b0 + mov s0, 0*8(rp) + mulx 4*8(ap), s8, s9 C a4 b0 + add s2, s1 + adc s4, s3 + mulx 5*8(ap), s0, s2 C a5 b0 + adc s6, s5 + adc s8, s7 + mulx 6*8(ap), s4, sx C a6 b0 + C 0, (1, -), (3, -), (5, -), (7, -), (9, 0), (2, 4), x + + adc s0, s9 + adc s4, s2 + mov 1*8(bp), %rdx + adc $0, sx + C 1, 3, 5, 7, 9, 2, x + + test %al, %al + mulx 0*8(ap), s0, s4 C a0 b1 + mulx 1*8(ap), s6, s8 C a1 b1 + adcx s0, s1 + adox s4, s3 + mulx 2*8(ap), s0, s4 C a2 b1 + adcx s6, s3 + adox s8, s5 + mulx 3*8(ap), s6, s8 C a3 b1 + adcx s0, s5 + adox s4, s7 + mulx 4*8(ap), s0, s4 C a4 b1 + adcx s6, s7 + adox s8, s9 + mulx 5*8(ap), s6, s8 C a5 b1 + adcx s0, s9 + adox s4, s2 + mulx 6*8(ap), %rdx, s0 C a6 b1, but we only use %rdx + adcx s6, s2 + adox s8, sx + mov s1, 1*8(rp) + adc %rdx, sx + C 3, 
5, 7, 9, 2, x + + mov 2*8(bp), %rdx + test %al, %al + mulx 0*8(ap), s0, s1 C a0 b2 + mulx 1*8(ap), s4, s6 C a1 b2 + adcx s0, s3 + adox s1, s5 + mov s3, 2*8(rp) + mulx 2*8(ap), s0, s1 C a2 b2 + adcx s4, s5 + adox s6, s7 + mulx 3*8(ap), s4, s6 C a3 b2 + adcx s0, s7 + adox s1, s9 + mulx 4*8(ap), s0, s1 C a4 b2 + adcx s4, s9 + adox s6, s2 + mulx 5*8(ap), %rdx, s3 C a5 b2, but we only use %rdx + adcx s0, s2 + adox s1, sx + adc %rdx, sx + C 5, 7, 9, 2, x + + mov 3*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b3 + mulx 1*8(ap), s3, s4 C a1 b3 + mulx 2*8(ap), s6, s8 C a2 b3 + add s0, s5 + adc s3, s1 + adc $0, s4 + mulx 3*8(ap), s0, s3 C a3 b3 + imul 4*8(ap), %rdx C L(a4 b3) + C -, (1, -), (4, 6), (8, 0), (3, rdx) + + mov s5, 3*8(rp) + add s1, s7 + adc s4, s9 + adc s8, s2 + adc s3, sx + add s6, s9 + adc s0, s2 + adc %rdx, sx + C 7, 9, 2, x + + mov 4*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b4 + mulx 1*8(ap), s3, s4 C a1 b4 + mulx 2*8(ap), s5, s6 C a2 b4 + imul 3*8(ap), %rdx C L(a3 b4) + C 0, (1, 3), (4, 5), (6, rdx) + + add s0, s7 + adc s1, s9 + adc s4, s2 + mov s7, 4*8(rp) + adc s6, sx + add s9, s3 + adc s5, s2 + adc %rdx, sx + C 3, 2, x + + mov 5*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b5 + mulx 1*8(ap), s4, s5 C a1 b5 + imul 2*8(ap), %rdx C L(a2 b5) + C 0, (1, 4), (5, rdx) + + pop s9 + pop s8 + + add s0, s3 + adc s1, s2 + adc s5, sx + mov s3, 5*8(rp) + add s4, s2 + adc %rdx, sx + C 2, x + + mov 6*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b6 + imul 1*8(ap), %rdx C L(a1 b6) + C 0, (1, rdx) + + pop s7 + pop s6 + pop s5 + pop s4 + + add s0, s2 + adc s1, sx + mov s2, 6*8(rp) + add %rdx, sx + + ret +EPILOGUE() + + ALIGN(16) +PROLOGUE(flint_mpn_mullow_8) + mov bp_param, bp + + push s4 + push s5 + push s6 + push s7 + push s8 + push s9 + + mov 0*8(bp_param), %rdx + mulx 0*8(ap), s0, s1 C a0 b0 + mulx 1*8(ap), s2, s3 C a1 b0 + mulx 2*8(ap), s4, s5 C a2 b0 + mulx 3*8(ap), s6, s7 C a3 b0 + mov s0, 0*8(rp) + mulx 4*8(ap), s8, s9 C a4 b0 + add s2, s1 + adc s4, s3 + mulx 5*8(ap), s0, s2 C a5 
b0 + adc s6, s5 + adc s8, s7 + mulx 6*8(ap), s4, s6 C a6 b0 + mulx 7*8(ap), s8, sx C a7 b0 + C -, (1, -), (3, -), (5, -), (7, -), (9, 0), (2, 4), (6, 8), x + + adc s0, s9 + adc s4, s2 + adc s8, s6 + mov 1*8(bp), %rdx + adc $0, sx + C 1, 3, 5, 7, 9, 2, 6, x + + test %al, %al + mulx 0*8(ap), s0, s4 C a0 b1 + adcx s0, s1 + adox s4, s3 + mov s1, 1*8(rp) + mulx 1*8(ap), s8, s0 C a1 b1 + mulx 2*8(ap), s4, s1 C a2 b1 + adcx s8, s3 + adox s0, s5 + mulx 3*8(ap), s8, s0 C a3 b1 + adcx s4, s5 + adox s1, s7 + mulx 4*8(ap), s4, s1 C a4 b1 + adcx s8, s7 + adox s0, s9 + mulx 5*8(ap), s8, s0 C a5 b1 + adcx s4, s9 + adox s1, s2 + mulx 6*8(ap), s4, s1 C a6 b1 + adcx s8, s2 + adox s0, s6 + mulx 7*8(ap), s8, s0 C a7 b1, but we only use s8 + adcx s4, s6 + adox s1, sx + adc s8, sx + C 3, 5, 7, 9, 2, 6, x + + mov 2*8(bp), %rdx + test %al, %al + mulx 0*8(ap), s0, s1 C a0 b2 + mulx 1*8(ap), s4, s8 C a1 b2 + adcx s0, s3 + adox s1, s5 + mov s3, 2*8(rp) + mulx 2*8(ap), s0, s1 C a2 b2 + adcx s4, s5 + adox s8, s7 + mulx 3*8(ap), s4, s8 C a3 b2 + adcx s0, s7 + adox s1, s9 + mulx 4*8(ap), s0, s1 C a4 b2 + adcx s4, s9 + adox s8, s2 + mulx 5*8(ap), s4, s8 C a5 b2 + adcx s0, s2 + adox s1, s6 + mulx 6*8(ap), s0, s1 C a6 b2, but we only use s0 + adcx s4, s6 + adox s8, sx + adc s0, sx + C 5, 7, 9, 2, 6, x + + mov 3*8(bp), %rdx + test %al, %al + mulx 0*8(ap), s0, s1 C a0 b3 + mulx 1*8(ap), s3, s4 C a1 b3 + adcx s0, s5 + adox s1, s7 + mulx 2*8(ap), s0, s1 C a2 b3 + adcx s3, s7 + adox s4, s9 + mulx 3*8(ap), s3, s4 C a3 b3 + adcx s0, s9 + adox s1, s2 + mulx 4*8(ap), s0, s1 C a4 b3 + adcx s3, s2 + adox s4, s6 + mulx 5*8(ap), s3, s4 C a5 b3, but we only use s3 + mov s5, 3*8(rp) + adcx s0, s6 + adox s1, sx + adc s3, sx + C 7, 9, 2, 6, x + + mov 4*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b4 + mulx 1*8(ap), s3, s4 C a1 b4 + mulx 2*8(ap), s5, s8 C a2 b4 + add s0, s7 + adc $0, s1 + mov s7, 4*8(rp) + mulx 3*8(ap), s0, s7 C a3 b4 + imul 4*8(ap), %rdx C L(a4 b4) + C -, (1, 3), (4, 5), (8, 0), (7, rdx) + + add s1, s9 
+ adc s4, s2 + adc s8, s6 + adc s7, sx + add s3, s9 + adc s5, s2 + adc s0, s6 + adc %rdx, sx + C 9, 2, 6, x + + mov 5*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b5 + mulx 1*8(ap), s5, s4 C a1 b5 + mulx 2*8(ap), s3, s7 C a2 b5 + imul 3*8(ap), %rdx C L(a3 b5) + C 0, (1, 5), (4, 3), (7, rdx) + + add s0, s9 + adc s1, s2 + adc s4, s6 + adc s7, sx + mov s9, 5*8(rp) + add s5, s2 + adc s6, s3 + adc %rdx, sx + C 2, 3, x + + mov 6*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b6 + mulx 1*8(ap), s4, s5 C a1 b6 + imul 2*8(ap), %rdx C L(a2 b6) + C 0, (1, 4), (5, rdx) + + pop s9 + pop s8 + + add s0, s2 + adc s1, s3 + adc s5, sx + mov s2, 6*8(rp) + add s4, s3 + adc %rdx, sx + C 3, x + + mov 7*8(bp), %rdx + mulx 0*8(ap), s0, s1 C a0 b7 + imul 1*8(ap), %rdx C L(a1 b7) + C 0, (1, rdx) + + pop s7 + pop s6 + pop s5 + pop s4 + + add s0, s3 + adc s1, sx + mov s3, 7*8(rp) + add %rdx, sx + + ret +EPILOGUE()