diff --git a/.gitignore b/.gitignore
index f7fb28e8f7..ce3195baac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,5 @@ flint.pc
autom4te.cache/
config.m4
src/flint-mparam.h
+.gdb_history
+vgcore.*
diff --git a/Makefile.in b/Makefile.in
index b48e007b66..5a09cd6046 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -56,6 +56,7 @@ RM_F:=rm -f
RM_RF:=rm -rf
CP:=cp
CP_A:=cp -a
+GDB:=gdb
STATIC:=@STATIC@
SHARED:=@SHARED@
@@ -498,6 +499,16 @@ else
-include $(BUILD_DIR)/test/*.d
-include $(BUILD_DIR)/*/test/*.d
endif
+else ifeq ($(MAKECMDGOALS), debug)
+ifdef MOD
+$(warning Dependency tracking only set to cover the test executables of $(MOD).)
+-include $(foreach dir, $(MOD), $(BUILD_DIR)/$(dir)/test/*.d)
+else
+-include $(BUILD_DIR)/*/*.o.d
+-include $(BUILD_DIR)/*/*.lo.d
+-include $(BUILD_DIR)/test/*.d
+-include $(BUILD_DIR)/*/test/*.d
+endif
else ifeq ($(MAKECMDGOALS), tune)
-include $(BUILD_DIR)/*/*.o.d
-include $(BUILD_DIR)/*/*.lo.d
@@ -798,6 +809,25 @@ check: library $(TESTS:%=%_TEST_RUN)
@echo 'All tests passed.'
endif
+################################################################################
+# debugging
+################################################################################
+
+%_TEST_DBG_RUN_ARGS: %
+ @$(GDB) --args $< $(ARGS)
+
+ifdef MOD
+ifdef ARGS
+DEBUG:=1
+debug: library $(patsubst %,%_TEST_DBG_RUN_ARGS, $($(sort $(MOD))_TESTS))
+endif
+endif
+
+ifneq ($(DEBUG),1)
+debug:
+ $(error Can only run debugger with one module and one argument at a time)
+endif
+
################################################################################
# tuning
################################################################################
@@ -952,11 +982,11 @@ dist:
dev/make_dist.sh $(FLINT_VERSION)
################################################################################
-# debugging
+# makefile debugging
################################################################################
print-%:
@echo "$*=$($*)"
-.PHONY: all library shared static examples checkexamples profile tests check tune valgrind clean distclean install uninstall dist %_TEST_RUN %_VALGRIND_RUN print-% coverage coverage_html
-.SILENT: $(mpn_extras_S_SOURCES)
+.PHONY: all library shared static examples checkexamples profile tests check tune valgrind clean distclean install uninstall dist %_TEST_RUN %_TEST_RUN_% %_TEST_DBG_RUN_ARGS %_VALGRIND_RUN print-% coverage coverage_html debug
+.PRECIOUS: $(mpn_extras_PIC_S_SOURCES) $(mpn_extras_S_SOURCES)
diff --git a/src/mpn_extras.h b/src/mpn_extras.h
index 1c3a76f000..1d8420ccb1 100644
--- a/src/mpn_extras.h
+++ b/src/mpn_extras.h
@@ -370,6 +370,36 @@ flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n)
flint_mpn_mul((_z), (_y), (_yn), (_x), (_xn)); \
}
+/* Low multiplication ********************************************************/
+
+#define FLINT_HAVE_MULLOW_FUNC(n) ((n) <= FLINT_MPN_MULLOW_FUNC_TAB_WIDTH)
+
+FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[];
+
+mp_limb_t flint_mpn_mullow_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+
+#if FLINT_HAVE_ASSEMBLY_x86_64_adx
+# define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 8
+# define FLINT_HAVE_NATIVE_mpn_mullow_basecase 1
+#else
+# define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 0
+#endif
+
+/* TODO: Fix higher stuff */
+MPN_EXTRAS_INLINE
+mp_limb_t flint_mpn_mullow_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+ FLINT_ASSERT(n >= 1);
+
+ if (FLINT_HAVE_MULLOW_FUNC(n))
+ {
+ FLINT_ASSERT(rp != xp);
+ return flint_mpn_mullow_func_tab[n](rp, xp, yp);
+ }
+ else
+ return flint_mpn_mullow_basecase(rp, xp, yp, n);
+}
+
/* High multiplication *******************************************************/
#define FLINT_HAVE_MULHIGH_FUNC(n) ((n) <= FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH)
diff --git a/src/mpn_extras/mullow.c b/src/mpn_extras/mullow.c
new file mode 100644
index 0000000000..266dfd0a44
--- /dev/null
+++ b/src/mpn_extras/mullow.c
@@ -0,0 +1,57 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+*/
+
+#include "mpn_extras.h"
+
+#if FLINT_HAVE_ASSEMBLY_x86_64_adx
+mp_limb_t flint_mpn_mullow_1(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_2(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_3(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_4(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_5(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_6(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_7(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_8(mp_ptr, mp_srcptr, mp_srcptr);
+
+const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[] =
+{
+ NULL,
+ flint_mpn_mullow_1,
+ flint_mpn_mullow_2,
+ flint_mpn_mullow_3,
+ flint_mpn_mullow_4,
+ flint_mpn_mullow_5,
+ flint_mpn_mullow_6,
+ flint_mpn_mullow_7,
+ flint_mpn_mullow_8
+};
+#else
+const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[] = { NULL };
+#endif
+
+#if !FLINT_HAVE_NATIVE_mpn_mullow_basecase
+mp_limb_t
+flint_mpn_mullow_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+ mp_limb_t ret;
+ mp_size_t ix;
+
+ ret = mpn_mul_1(rp, xp, n, yp[0]);
+
+ for (ix = 1; ix < n; ix++)
+ {
+ ret += mpn_addmul_1(rp + ix, xp, n - ix, yp[ix]);
+ ret += xp[n - ix] * yp[ix];
+ }
+
+ return ret;
+}
+#endif
diff --git a/src/mpn_extras/profile/p-mullow.c b/src/mpn_extras/profile/p-mullow.c
new file mode 100644
index 0000000000..6a82befd61
--- /dev/null
+++ b/src/mpn_extras/profile/p-mullow.c
@@ -0,0 +1,74 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+*/
+
+#include "mpn_extras.h"
+#include "profiler.h"
+
+#define mpn_mullo_basecase __gmpn_mullo_basecase
+void mpn_mullo_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+
+#define N_MIN 1
+#define N_MAX 30
+
+#if N_MAX < 9
+# define P 1
+#elif N_MAX < 99
+# define P 2
+#elif N_MAX < 999
+# define P 3
+#elif N_MAX < 9999
+# define P 4
+#else
+# define P 5
+#endif
+
+#define _STR(x) #x
+#define STR(x) _STR(x)
+
+int
+main(void)
+{
+ mp_limb_t rp[2 * N_MAX];
+ mp_limb_t xp[N_MAX];
+ mp_limb_t yp[N_MAX];
+ mp_size_t n;
+
+ flint_printf("%.*s mullo_basecase / FLINT || mul / mullow\n", P, " ");
+ for (n = N_MIN; n <= N_MAX; n++)
+ {
+ double t1, t2, t3, FLINT_SET_BUT_UNUSED(__);
+ flint_printf("n = %" STR(P) "wd:", n);
+
+ mpn_random2(xp, n);
+ mpn_random2(yp, n);
+
+ TIMEIT_START
+ mpn_mullo_basecase(rp, xp, yp, n);
+ TIMEIT_STOP_VALUES(__, t1)
+
+ TIMEIT_START
+ flint_mpn_mul_n(rp, xp, yp, n);
+ TIMEIT_STOP_VALUES(__, t2)
+
+ TIMEIT_START
+ flint_mpn_mullow_n(rp, xp, yp, n);
+ TIMEIT_STOP_VALUES(__, t3)
+
+ flint_printf(" %7.2fx || %7.2f\n", t1 / t3, t2 / t3);
+ }
+
+ flint_cleanup_master();
+
+ return 0;
+}
+
+#undef N_MIN
+#undef N_MAX
diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c
index 0d8ab7c6ac..e28b927bd1 100644
--- a/src/mpn_extras/test/main.c
+++ b/src/mpn_extras/test/main.c
@@ -24,6 +24,7 @@
#include "t-mul.c"
#include "t-mul_n.c"
#include "t-mul_toom22.c"
+#include "t-mullow_n.c"
#include "t-mulhigh.c"
#include "t-mulhigh_normalised.c"
#include "t-mulmod_2expp1.c"
@@ -48,6 +49,7 @@ test_struct tests[] =
TEST_FUNCTION(flint_mpn_mul),
TEST_FUNCTION(flint_mpn_mul_n),
TEST_FUNCTION(flint_mpn_mul_toom22),
+ TEST_FUNCTION(flint_mpn_mullow_n),
TEST_FUNCTION(flint_mpn_mulhigh),
TEST_FUNCTION(flint_mpn_mulhigh_normalised),
TEST_FUNCTION(flint_mpn_mulmod_2expp1),
diff --git a/src/mpn_extras/test/t-mullow_n.c b/src/mpn_extras/test/t-mullow_n.c
new file mode 100644
index 0000000000..388fd330fb
--- /dev/null
+++ b/src/mpn_extras/test/t-mullow_n.c
@@ -0,0 +1,69 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "mpn_extras.h"
+
+#define N_MIN 1
+#define N_MAX 30
+
+TEST_FUNCTION_START(flint_mpn_mullow_n, state)
+{
+ slong ix;
+ int result;
+
+ mp_ptr rp, rpf, xp, yp;
+
+ for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++)
+ {
+ mp_limb_t ret;
+ mp_size_t n;
+
+ n = N_MIN + n_randint(state, N_MAX - N_MIN + 1);
+
+ rp = flint_malloc(sizeof(mp_limb_t) * n);
+ rpf = flint_malloc(2 * sizeof(mp_limb_t) * n);
+ xp = flint_malloc(sizeof(mp_limb_t) * n);
+ yp = flint_malloc(sizeof(mp_limb_t) * n);
+
+ mpn_random2(xp, n);
+ mpn_random2(yp, n);
+
+        /* all four buffers are freed at the end of each iteration */
+
+
+ ret = flint_mpn_mullow_n(rp, xp, yp, n);
+ flint_mpn_mul_n(rpf, xp, yp, n);
+
+ result = (mpn_cmp(rp, rpf, n) == 0 && ret == rpf[n]);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "yp = %{ulong*}\n"
+ "Exp ret: %{ulong}\n"
+ "Got ret: %{ulong}\n"
+ "Expected: %{ulong*}\n"
+ "Got: %{ulong*}\n",
+ ix, n, xp, n, yp, n, rpf[n], ret, rpf, n, rp, n);
+
+ flint_free(rp);
+ flint_free(rpf);
+ flint_free(xp);
+ flint_free(yp);
+ }
+
+ TEST_FUNCTION_END(state);
+}
+
+#undef N_MIN
+#undef N_MAX
diff --git a/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm b/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm
new file mode 100644
index 0000000000..3fa98bee51
--- /dev/null
+++ b/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm
@@ -0,0 +1,484 @@
+dnl
+dnl Copyright 2017 Free Software Foundation, Inc.
+dnl Contributed to the GNU project by Torbjorn Granlund.
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+dnl
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`config.m4')
+
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`bp', `%r11')
+
+define(`jmpreg',`%r15')
+define(`nn', `%rbp')
+define(`mm', `%rbx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`s2', `%r10')
+define(`s3', `%r12')
+define(`s4', `%r13')
+define(`s5', `%r14')
+
+define(`sx', `%rax')
+
+dnl Scheme:
+dnl 0 1 2 3 4 5 6 7 8
+dnl 0 x x x x x x x x x
+dnl 1 x x x x x x x x l
+dnl 2 x x x x x x x l
+dnl 3 x x x x x x l
+dnl 4 x x x x x l
+dnl 5 x x x x l
+dnl 6 x x x l
+dnl 7 x x l
+dnl 8 x l
+
+dnl NOTE: Requires n > 8.
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(flint_mpn_mullow_basecase)
+ push mm
+ push nn
+ push s3
+ push s4
+ push s5
+ push jmpreg
+
+ lea -2(n), R32(nn)
+ lea 1*8(bp_param), bp C Prepare bp for addmul_1
+ mov 0*8(bp_param), %rdx C Load rdx for mul_1
+
+ mov R32(n), R32(sx)
+ shr $3, R32(n)
+ and $7, R32(sx) C clear OF, CF as side-effect
+ lea L(mtab)(%rip), s0
+ifdef(`PIC',
+` movslq (s0,sx,4), sx
+ lea (sx,s0), s0
+ jmp *s0
+',`
+ jmp *(s0,sx,8)
+')
+
+L(mf0): mulx 0*8(ap), s0, s2
+ lea 7*8(ap), ap
+ lea -1*8(rp), rp
+ lea L(f0)(%rip), jmpreg
+ jmp L(mb0)
+
+L(mf3): mulx 0*8(ap), s1, sx
+ lea 2*8(ap), ap
+ lea 2*8(rp), rp
+ inc R32(n)
+ lea L(f3)(%rip), jmpreg
+ jmp L(mb3)
+
+L(mf4): mulx 0*8(ap), s0, s2
+ lea 3*8(ap), ap
+ lea 3*8(rp), rp
+ inc R32(n)
+ lea L(f4)(%rip), jmpreg
+ jmp L(mb4)
+
+L(mf5): mulx 0*8(ap), s1, sx
+ lea 4*8(ap), ap
+ lea 4*8(rp), rp
+ inc R32(n)
+ lea L(f5)(%rip), jmpreg
+ jmp L(mb5)
+
+L(mf6): mulx 0*8(ap), s0, s2
+ lea 5*8(ap), ap
+ lea 5*8(rp), rp
+ inc R32(n)
+ lea L(f6)(%rip), jmpreg
+ jmp L(mb6)
+
+L(mf7): mulx 0*8(ap), s1, sx
+ lea 6*8(ap), ap
+ lea 6*8(rp), rp
+ inc R32(n)
+ lea L(f7)(%rip), jmpreg
+ jmp L(mb7)
+
+L(mf1): mulx 0*8(ap), s1, sx
+ lea L(f1)(%rip), jmpreg
+ jmp L(mb1)
+
+L(mf2): mulx 0*8(ap), s0, s2
+ lea 1*8(ap), ap
+ lea 1*8(rp), rp
+ lea L(f2)(%rip), jmpreg
+ mulx 0*8(ap), s1, sx
+
+ ALIGN(32)
+L(mtop):mov s0, -1*8(rp)
+ adc s2, s1
+L(mb1): mulx 1*8(ap), s0, s2
+ adc sx, s0
+ lea 8*8(ap), ap
+ mov s1, 0*8(rp)
+L(mb0): mov s0, 1*8(rp)
+ mulx -6*8(ap), s1, sx
+ lea 8*8(rp), rp
+ adc s2, s1
+L(mb7): mulx -5*8(ap), s0, s2
+ mov s1, -6*8(rp)
+ adc sx, s0
+L(mb6): mov s0, -5*8(rp)
+ mulx -4*8(ap), s1, sx
+ adc s2, s1
+L(mb5): mulx -3*8(ap), s0, s2
+ mov s1, -4*8(rp)
+ adc sx, s0
+L(mb4): mulx -2*8(ap), s1, sx
+ mov s0, -3*8(rp)
+ adc s2, s1
+L(mb3): mulx -1*8(ap), s0, s2
+ adc sx, s0
+ mov s1, -2*8(rp)
+ dec R32(n)
+ mulx 0*8(ap), s1, sx
+ jnz L(mtop)
+
+ lea 1*8(,nn,8), R32(mm)
+ mov s0, -1*8(rp)
+ adc s2, s1
+ mov 0*8(bp), %rdx
+ mov s1, 0*8(rp)
+ adc n, sx C n = 0
+ shr $3, R32(nn)
+ neg mm
+ or R32(nn), R32(n) C Reset n, clear OF and CF
+ lea 1*8(rp,mm), rp C Reset rp
+ lea (ap,mm), ap C Reset ap
+ jmp *jmpreg
+
+ nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;
+L(f7): mulx 0*8(ap), s0, s2
+ lea 5*8(ap), ap
+ lea -3*8(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(b7)
+
+L(f6): mulx 0*8(ap), s1, s3
+ lea 4*8(ap), ap
+ lea -4*8(rp), rp
+ lea L(f5)(%rip), jmpreg
+ jmp L(b6)
+
+L(end): adox 0*8(rp), s1
+ mulx 1*8(ap), s0, s2 C Only s0 is used
+ lea 1*8(mm), mm
+ adox s3, sx
+ mov s1, 0*8(rp)
+ lea 1*8(bp), bp
+ adc s0, sx
+ lea (ap,mm), ap C Reset ap
+ mov 0*8(bp), %rdx
+ or R32(nn), R32(n) C Reset count, clear CF and OF
+ lea 1*8(rp,mm), rp C Reset rp
+ jmp *jmpreg
+
+L(f0): mulx 0*8(ap), s1, s3
+ lea -2*8(ap), ap
+ lea -2*8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(b0)
+
+L(f3): mulx 0*8(ap), s0, s2
+ lea 1*8(ap), ap
+ lea 1*8(rp), rp
+ mulx 0*8(ap), s1, s3
+ lea L(f2)(%rip), jmpreg
+
+ ALIGN(32)
+L(top): adox -1*8(rp), s0
+ adcx s2, s1
+ mov s0, -1*8(rp)
+ jrcxz L(end)
+L(b2): mulx 1*8(ap), s0, s2
+ adox 0*8(rp), s1
+ lea -1(n), R32(n)
+ mov s1, 0*8(rp)
+ adcx s3, s0
+L(b1): mulx 2*8(ap), s1, s3
+ adcx s2, s1
+ adox 1*8(rp), s0
+ mov s0, 1*8(rp)
+L(b0): mulx 3*8(ap), s0, s2
+ lea 8*8(ap), ap
+ adcx s3, s0
+ adox 2*8(rp), s1
+ mov s1, 2*8(rp)
+L(b7): mulx -4*8(ap), s1, s3
+ adox 3*8(rp), s0
+ adcx s2, s1
+ mov s0, 3*8(rp)
+L(b6): mulx -3*8(ap), s0, s2
+ adcx s3, s0
+ adox 4*8(rp), s1
+ mov s1, 4*8(rp)
+L(b5): mulx -2*8(ap), s1, s3
+ adox 5*8(rp), s0
+ adcx s2, s1
+ mov s0, 5*8(rp)
+L(b4): adox 6*8(rp), s1
+ mulx -1*8(ap), s0, s2
+ mov s1, 6*8(rp)
+ lea 8*8(rp), rp
+ adcx s3, s0
+ mulx 0*8(ap), s1, s3
+ jmp L(top)
+
+L(f5): mulx 0*8(ap), s0, s2
+ lea 3*8(ap), ap
+ lea -5*8(rp), rp
+ lea L(f4)(%rip), jmpreg
+ jmp L(b5)
+
+L(f4): mulx 0*8(ap), s1, s3
+ lea 2*8(ap), ap
+ lea -6*8(rp), rp
+ lea L(f3)(%rip), jmpreg
+ jmp L(b4)
+
+L(f2): mulx 0*8(ap), s1, s3
+ lea -1(nn), R32(nn)
+ lea L(f1)(%rip), jmpreg
+ jmp L(b2)
+
+L(f1): mulx 0*8(ap), s0, s2
+ jrcxz L(cor)
+ lea -1*8(ap), ap
+ lea -1*8(rp), rp
+ lea L(f0)(%rip), jmpreg
+ jmp L(b1)
+
+define(`t0', `s0')
+define(`t1', `s2')
+define(`t2', `s1')
+define(`t3', `s3')
+define(`t4', `s4')
+define(`t5', `s5')
+define(`t6', `jmpreg')
+define(`t7', `mm')
+define(`t8', `nn')
+define(`t9', `n')
+define(`tx', `sx')
+L(cor): mulx 1*8(ap), t2, t3
+ mulx 2*8(ap), t4, t5
+ adcx 0*8(rp), t0
+ adox t1, t2
+ mulx 3*8(ap), t6, t7
+ adcx 1*8(rp), t2
+ adox t3, t4
+ mulx 4*8(ap), t8, t9
+ adcx 2*8(rp), t4
+ adox t5, t6
+ mulx 5*8(ap), t1, t3
+ adcx 3*8(rp), t6
+ adox t7, t8
+ mulx 6*8(ap), t5, t7
+ mov t0, 0*8(rp)
+ adcx 4*8(rp), t8
+ adox t9, t1
+ mulx 7*8(ap), t0, t9
+ adcx 5*8(rp), t1
+ adox t3, t5
+ mulx 8*8(ap), t3, %rdx C %rdx unused
+ adcx 6*8(rp), t5
+ adox t7, t0
+ adcx 7*8(rp), t0
+ adox t9, tx
+ C 2, 4, 6, 8, 1, 5, 0, x
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), t7, t9
+ adc t3, tx
+ test %al, %al C Reset OF and CF
+ adcx t7, t2
+ adox t9, t4
+ mov t2, 1*8(rp)
+ mulx 1*8(ap), t7, t9
+ mulx 2*8(ap), t2, t3
+ adcx t7, t4
+ adox t9, t6
+ mulx 3*8(ap), t7, t9
+ adcx t2, t6
+ adox t3, t8
+ mulx 4*8(ap), t2, t3
+ adcx t7, t8
+ adox t9, t1
+ mulx 5*8(ap), t7, t9
+ adcx t2, t1
+ adox t3, t5
+ mulx 6*8(ap), t2, t3
+ adcx t7, t5
+ adox t9, t0
+ mulx 7*8(ap), t7, t9 C t9 unused
+ adcx t2, t0
+ adox t3, tx
+ C 4, 6, 8, 1, 5, 0, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), t2, t3
+ adc t7, tx
+ test %al, %al
+ mulx 1*8(ap), t7, t9
+ adcx t2, t4
+ adox t3, t6
+ mov t4, 2*8(rp)
+ mulx 2*8(ap), t2, t3
+ adcx t7, t6
+ adox t9, t8
+ mulx 3*8(ap), t7, t9
+ adcx t2, t8
+ adox t3, t1
+ mulx 4*8(ap), t2, t3
+ adcx t7, t1
+ adox t9, t5
+ mulx 5*8(ap), t7, t9
+ adcx t2, t5
+ adox t3, t0
+ mulx 6*8(ap), t2, t3 C t3 unused
+ adcx t7, t0
+ adox t9, tx
+ C 6, 8, 1, 5, 0, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), t4, t7
+ adc t2, tx
+ test %al, %al
+ mulx 1*8(ap), t2, t3
+ adcx t4, t6
+ adox t7, t8
+ mov t6, 3*8(rp)
+ mulx 2*8(ap), t4, t7
+ mulx 3*8(ap), t6, t9
+ adcx t2, t8
+ adox t3, t1
+ mulx 4*8(ap), t2, t3
+ adcx t4, t1
+ adox t7, t5
+ mulx 5*8(ap), t4, t7 C t7 unused
+ adcx t6, t5
+ adox t9, t0
+ adcx t2, t0
+ adox t3, tx
+ C 8, 1, 5, 0, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), t2, t3
+ adc t4, tx
+ test %al, %al
+ mulx 1*8(ap), t4, t6
+ mulx 2*8(ap), t7, t9
+ adcx t2, t8
+ adox t3, t1
+ mulx 3*8(ap), t2, t3
+ adcx t4, t1
+ adox t6, t5
+ mulx 4*8(ap), t4, t6 C t6 unused
+ adcx t7, t5
+ adox t9, t0
+ mov t8, 4*8(rp)
+ adox t3, tx
+ adc t2, t0
+ C 1, 5, 0, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), t2, t3
+ adc t4, tx
+ mulx 1*8(ap), t4, t6
+ mulx 2*8(ap), t7, t8
+ imul 3*8(ap), %rdx
+ add t2, t1
+ adc t3, t5
+ mov t1, 5*8(rp)
+ adc t6, t0
+ adc t8, tx
+ add t4, t5
+ adc t7, t0
+ adc %rdx, tx
+ C 5, 0, x
+
+ mov 6*8(bp), %rdx
+ mulx 0*8(ap), t1, t2
+ mulx 1*8(ap), t3, t4
+ imul 2*8(ap), %rdx
+ add t1, t5
+ adc t2, t0
+ mov t5, 6*8(rp)
+ adc t4, tx
+ add t3, t0
+ adc %rdx, tx
+ C 0, x
+
+ mov 7*8(bp), %rdx
+ mulx 0*8(ap), t1, t2
+ imul 1*8(ap), %rdx
+
+ pop jmpreg
+ pop s5
+ pop s4
+ pop s3
+ pop nn
+ pop mm
+
+ add t1, t0
+ adc t2, tx
+ mov t0, 7*8(rp)
+ add %rdx, tx
+
+ ret
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+ JMPENT( L(mf7), L(mtab))
diff --git a/src/mpn_extras/x86_64/broadwell/mullow_hard.asm b/src/mpn_extras/x86_64/broadwell/mullow_hard.asm
new file mode 100644
index 0000000000..24d0e92693
--- /dev/null
+++ b/src/mpn_extras/x86_64/broadwell/mullow_hard.asm
@@ -0,0 +1,744 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+dnl
+
+include(`config.m4')
+
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+
+define(`bp', `%r8')
+
+define(`s0', `%rcx')
+define(`s1', `%r9')
+define(`s2', `%r10')
+define(`s3', `%r11')
+define(`s4', `%rbx')
+define(`s5', `%rbp')
+define(`s6', `%r12')
+define(`s7', `%r13')
+define(`s8', `%r14')
+define(`s9', `%r15')
+
+define(`sx', `%rax')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_1)
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, sx C a0 b0
+ mov s0, 0*8(rp)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_2)
+ mov 1*8(bp_param), sx
+ mov 0*8(bp_param), %rdx
+
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mov sx, %rdx
+ mulx 0*8(ap), bp_param, bp C a0 b1
+ imul 1*8(ap), sx C L(a1 b1)
+ C s0, (s1, s2, bp_param), (s3, bp, sx)
+
+ mov s0, 0*8(rp)
+
+ add s2, s1
+ adc s3, bp
+ C (s1, bp_param), (bp, sx)
+
+ add bp_param, s1
+ adc bp, sx
+ C s1, sx
+
+ mov s1, 1*8(rp)
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_3)
+ push s4
+ push s5
+
+ mov bp_param, bp
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, sx C a2 b0
+ C 0, (1, 2), (3, 4), x
+
+ mov s0, 0*8(rp)
+ add s2, s1
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b1
+ adc s4, s3
+ adc $0, sx
+ mulx 1*8(ap), s4, s5 C a1 b1
+ imul 2*8(ap), %rdx C L(a2 b1)
+ C 0, (2, 4), (5, rdx)
+
+ add s0, s1
+ adc s2, s3
+ mov s1, 1*8(rp)
+ adc %rdx, sx
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b2
+ imul 1*8(ap), %rdx C L(a1 b2)
+ C 0, (2, rdx)
+
+ add s4, s3
+ adc s5, sx
+
+ pop s5
+ pop s4
+
+ add s2, sx
+ add s0, s3
+ adc %rdx, sx
+ mov s3, 2*8(rp)
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_4)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, sx C a3 b0
+ C 0, (1, 2), (3, 4), (5, 6), x
+
+ mov s0, 0*8(rp)
+ add s2, s1
+ adc s4, s3
+ adc s6, s5
+ adc $0, sx
+ C 1, 3, 5, x
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b1
+ mulx 1*8(ap), s4, s6 C a1 b1
+ mulx 2*8(ap), s7, s8 C a2 b1
+ imul 3*8(ap), %rdx C L(a3 b1)
+ C 0, (2, 4), (6, 7), (8, rdx)
+
+ add s0, s1
+ adc s2, s3
+ adc s6, s5
+ adc s8, sx
+ mov s1, 1*8(rp)
+ add s4, s3
+ adc s7, s5
+ adc %rdx, sx
+ C 3, 5, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s2, s4 C a1 b2
+ imul 2*8(ap), %rdx C L(a2 b2)
+ C 0, (1, 2), (4, rdx)
+
+ add s0, s3
+ adc s1, s5
+ mov s3, 2*8(rp)
+ adc s4, sx
+ add s5, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ imul 1*8(ap), %rdx C L(a1 b3)
+ C 0, (1, rdx)
+
+ pop s8
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 3*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_5)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mulx 4*8(ap), s8, sx C a4 b0
+ C 0, (1, 2), (3, 4), (5, 6), (7, 8), x
+
+ mov s0, 0*8(rp)
+ add s2, s1
+ adc s4, s3
+ adc s6, s5
+ adc s8, s7
+ adc $0, sx
+ C 1, 3, 5, 7, x
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b1
+ mulx 1*8(ap), s4, s6 C a1 b1
+ mulx 2*8(ap), s8, s9 C a2 b1
+ add s0, s1
+ adc s2, s3
+ mov s1, 1*8(rp)
+ mulx 3*8(ap), s0, s2 C a3 b1
+ mulx 4*8(ap), %rdx, s1 C a4 b1, but we only use %rdx
+ C -, (-, 4), (6, 8), (9, 0), (2, %rdx)
+
+ adc s6, s5
+ adc s9, s7
+ adc s2, sx
+ add s4, s3
+ adc s8, s5
+ adc s0, s7
+ adc %rdx, sx
+ C 3, 5, 7, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s2, s4 C a1 b2
+ mulx 2*8(ap), s6, s8 C a2 b2
+ imul 3*8(ap), %rdx C L(a3 b2)
+ C 0, (1, 2), (4, 6), (8, rdx)
+
+ add s0, s3
+ adc s1, s5
+ adc s4, s7
+ adc s8, sx
+ add s2, s5
+ adc s6, s7
+ adc %rdx, sx
+ mov s3, 2*8(rp)
+ C 5, 7, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s2, s3 C a1 b3
+ imul 2*8(ap), %rdx C L(a2 b3)
+ C 0, (1, 2), (3, rdx)
+
+ add s0, s5
+ adc s1, s7
+ mov s5, 3*8(rp)
+ adc s3, sx
+ add s7, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ imul 1*8(ap), %rdx C L(a1 b4)
+ C 0, (1, rdx)
+
+ pop s9
+ pop s8
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 4*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_6)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mulx 4*8(ap), s8, s9 C a4 b0
+ mulx 5*8(ap), %rdx, sx C a5 b0
+ C 0, (1, 2), (3, 4), (5, 6), (7, 8), (9, rdx), x
+
+ add s2, s1
+ adc s4, s3
+ adc s6, s5
+ mov s0, 0*8(rp)
+ adc s8, s7
+ adc %rdx, s9
+ adc $0, sx
+ C 1, 3, 5, 7, 9, x
+
+ test %al, %al
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s2, s4 C a0 b1
+ mulx 1*8(ap), s6, s8 C a1 b1
+ adcx s2, s1
+ adox s4, s3
+ mov s1, 1*8(rp)
+ mulx 2*8(ap), s2, s4 C a2 b1
+ adcx s6, s3
+ adox s8, s5
+ mulx 3*8(ap), s6, s8 C a3 b1
+ adcx s2, s5
+ adox s4, s7
+ mulx 4*8(ap), s2, s4 C a4 b1
+ adcx s6, s7
+ adox s8, s9
+ mulx 5*8(ap), %rdx, s6 C a5 b1, but we only use %rdx
+ C -, (-, -), (-, -), (-, -), (-, 2), (4, %rdx)
+
+ adcx s2, s9
+ adox s4, sx
+ adc %rdx, sx
+ C 3, 5, 7, 9, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s2, s4 C a1 b2
+ mulx 2*8(ap), s6, s8 C a2 b2
+ add s0, s3
+ adc s1, s5
+ mov s3, 2*8(rp)
+ mulx 3*8(ap), s0, s1 C a3 b2
+ mulx 4*8(ap), %rdx, s3 C a4 b2, but we only use %rdx
+ C -, (-, 2), (4, 6), (8, 0), (1, rdx)
+
+ adc s4, s7
+ adc s8, s9
+ adc s1, sx
+ add s2, s5
+ adc s6, s7
+ adc s0, s9
+ adc %rdx, sx
+ C 5, 7, 9, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s2, s3 C a1 b3
+ mulx 2*8(ap), s4, s6 C a2 b3
+ imul 3*8(ap), %rdx C L(a3 b3)
+ C 0, (1, 2), (3, 4), (6, rdx)
+
+ add s0, s5
+ adc s1, s7
+ adc s3, s9
+ mov s5, 3*8(rp)
+ adc s6, sx
+ add s2, s7
+ adc s4, s9
+ adc %rdx, sx
+ C 7, 9, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ mulx 1*8(ap), s2, s3 C a1 b4
+ imul 2*8(ap), %rdx C L(a2 b4)
+ C 0, (1, 2), (3, rdx)
+
+ add s0, s7
+ adc s1, s9
+ adc s3, sx
+ mov s7, 4*8(rp)
+ add s9, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b5
+ imul 1*8(ap), %rdx C L(a1 b5)
+ C 0, (1, rdx)
+
+ pop s9
+ pop s8
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 5*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_7)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mov s0, 0*8(rp)
+ mulx 4*8(ap), s8, s9 C a4 b0
+ add s2, s1
+ adc s4, s3
+ mulx 5*8(ap), s0, s2 C a5 b0
+ adc s6, s5
+ adc s8, s7
+ mulx 6*8(ap), s4, sx C a6 b0
+ C 0, (1, -), (3, -), (5, -), (7, -), (9, 0), (2, 4), x
+
+ adc s0, s9
+ adc s4, s2
+ mov 1*8(bp), %rdx
+ adc $0, sx
+ C 1, 3, 5, 7, 9, 2, x
+
+ test %al, %al
+ mulx 0*8(ap), s0, s4 C a0 b1
+ mulx 1*8(ap), s6, s8 C a1 b1
+ adcx s0, s1
+ adox s4, s3
+ mulx 2*8(ap), s0, s4 C a2 b1
+ adcx s6, s3
+ adox s8, s5
+ mulx 3*8(ap), s6, s8 C a3 b1
+ adcx s0, s5
+ adox s4, s7
+ mulx 4*8(ap), s0, s4 C a4 b1
+ adcx s6, s7
+ adox s8, s9
+ mulx 5*8(ap), s6, s8 C a5 b1
+ adcx s0, s9
+ adox s4, s2
+ mulx 6*8(ap), %rdx, s0 C a6 b1, but we only use %rdx
+ adcx s6, s2
+ adox s8, sx
+ mov s1, 1*8(rp)
+ adc %rdx, sx
+ C 3, 5, 7, 9, 2, x
+
+ mov 2*8(bp), %rdx
+ test %al, %al
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s4, s6 C a1 b2
+ adcx s0, s3
+ adox s1, s5
+ mov s3, 2*8(rp)
+ mulx 2*8(ap), s0, s1 C a2 b2
+ adcx s4, s5
+ adox s6, s7
+ mulx 3*8(ap), s4, s6 C a3 b2
+ adcx s0, s7
+ adox s1, s9
+ mulx 4*8(ap), s0, s1 C a4 b2
+ adcx s4, s9
+ adox s6, s2
+ mulx 5*8(ap), %rdx, s3 C a5 b2, but we only use %rdx
+ adcx s0, s2
+ adox s1, sx
+ adc %rdx, sx
+ C 5, 7, 9, 2, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s3, s4 C a1 b3
+ mulx 2*8(ap), s6, s8 C a2 b3
+ add s0, s5
+ adc s3, s1
+ adc $0, s4
+ mulx 3*8(ap), s0, s3 C a3 b3
+ imul 4*8(ap), %rdx C L(a4 b3)
+ C -, (1, -), (4, 6), (8, 0), (3, rdx)
+
+ mov s5, 3*8(rp)
+ add s1, s7
+ adc s4, s9
+ adc s8, s2
+ adc s3, sx
+ add s6, s9
+ adc s0, s2
+ adc %rdx, sx
+ C 7, 9, 2, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ mulx 1*8(ap), s3, s4 C a1 b4
+ mulx 2*8(ap), s5, s6 C a2 b4
+ imul 3*8(ap), %rdx C L(a3 b4)
+ C 0, (1, 3), (4, 5), (6, rdx)
+
+ add s0, s7
+ adc s1, s9
+ adc s4, s2
+ mov s7, 4*8(rp)
+ adc s6, sx
+ add s9, s3
+ adc s5, s2
+ adc %rdx, sx
+ C 3, 2, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b5
+ mulx 1*8(ap), s4, s5 C a1 b5
+ imul 2*8(ap), %rdx C L(a2 b5)
+ C 0, (1, 4), (5, rdx)
+
+ pop s9
+ pop s8
+
+ add s0, s3
+ adc s1, s2
+ adc s5, sx
+ mov s3, 5*8(rp)
+ add s4, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 6*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b6
+ imul 1*8(ap), %rdx C L(a1 b6)
+ C 0, (1, rdx)
+
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 6*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_8)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mov s0, 0*8(rp)
+ mulx 4*8(ap), s8, s9 C a4 b0
+ add s2, s1
+ adc s4, s3
+ mulx 5*8(ap), s0, s2 C a5 b0
+ adc s6, s5
+ adc s8, s7
+ mulx 6*8(ap), s4, s6 C a6 b0
+ mulx 7*8(ap), s8, sx C a7 b0
+ C -, (1, -), (3, -), (5, -), (7, -), (9, 0), (2, 4), (6, 8), x
+
+ adc s0, s9
+ adc s4, s2
+ adc s8, s6
+ mov 1*8(bp), %rdx
+ adc $0, sx
+ C 1, 3, 5, 7, 9, 2, 6, x
+
+ test %al, %al
+ mulx 0*8(ap), s0, s4 C a0 b1
+ adcx s0, s1
+ adox s4, s3
+ mov s1, 1*8(rp)
+ mulx 1*8(ap), s8, s0 C a1 b1
+ mulx 2*8(ap), s4, s1 C a2 b1
+ adcx s8, s3
+ adox s0, s5
+ mulx 3*8(ap), s8, s0 C a3 b1
+ adcx s4, s5
+ adox s1, s7
+ mulx 4*8(ap), s4, s1 C a4 b1
+ adcx s8, s7
+ adox s0, s9
+ mulx 5*8(ap), s8, s0 C a5 b1
+ adcx s4, s9
+ adox s1, s2
+ mulx 6*8(ap), s4, s1 C a6 b1
+ adcx s8, s2
+ adox s0, s6
+ mulx 7*8(ap), s8, s0 C a7 b1, but we only use s8
+ adcx s4, s6
+ adox s1, sx
+ adc s8, sx
+ C 3, 5, 7, 9, 2, 6, x
+
+ mov 2*8(bp), %rdx
+ test %al, %al
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s4, s8 C a1 b2
+ adcx s0, s3
+ adox s1, s5
+ mov s3, 2*8(rp)
+ mulx 2*8(ap), s0, s1 C a2 b2
+ adcx s4, s5
+ adox s8, s7
+ mulx 3*8(ap), s4, s8 C a3 b2
+ adcx s0, s7
+ adox s1, s9
+ mulx 4*8(ap), s0, s1 C a4 b2
+ adcx s4, s9
+ adox s8, s2
+ mulx 5*8(ap), s4, s8 C a5 b2
+ adcx s0, s2
+ adox s1, s6
+ mulx 6*8(ap), s0, s1 C a6 b2, but we only use s0
+ adcx s4, s6
+ adox s8, sx
+ adc s0, sx
+ C 5, 7, 9, 2, 6, x
+
+ mov 3*8(bp), %rdx
+ test %al, %al
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s3, s4 C a1 b3
+ adcx s0, s5
+ adox s1, s7
+ mulx 2*8(ap), s0, s1 C a2 b3
+ adcx s3, s7
+ adox s4, s9
+ mulx 3*8(ap), s3, s4 C a3 b3
+ adcx s0, s9
+ adox s1, s2
+ mulx 4*8(ap), s0, s1 C a4 b3
+ adcx s3, s2
+ adox s4, s6
+ mulx 5*8(ap), s3, s4 C a5 b3, but we only use s3
+ mov s5, 3*8(rp)
+ adcx s0, s6
+ adox s1, sx
+ adc s3, sx
+ C 7, 9, 2, 6, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ mulx 1*8(ap), s3, s4 C a1 b4
+ mulx 2*8(ap), s5, s8 C a2 b4
+ add s0, s7
+ adc $0, s1
+ mov s7, 4*8(rp)
+ mulx 3*8(ap), s0, s7 C a3 b4
+ imul 4*8(ap), %rdx C L(a4 b4)
+ C -, (1, 3), (4, 5), (8, 0), (7, rdx)
+
+ add s1, s9
+ adc s4, s2
+ adc s8, s6
+ adc s7, sx
+ add s3, s9
+ adc s5, s2
+ adc s0, s6
+ adc %rdx, sx
+ C 9, 2, 6, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b5
+ mulx 1*8(ap), s5, s4 C a1 b5
+ mulx 2*8(ap), s3, s7 C a2 b5
+ imul 3*8(ap), %rdx C L(a3 b5)
+ C 0, (1, 5), (4, 3), (7, rdx)
+
+ add s0, s9
+ adc s1, s2
+ adc s4, s6
+ adc s7, sx
+ mov s9, 5*8(rp)
+ add s5, s2
+ adc s6, s3
+ adc %rdx, sx
+ C 2, 3, x
+
+ mov 6*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b6
+ mulx 1*8(ap), s4, s5 C a1 b6
+ imul 2*8(ap), %rdx C L(a2 b6)
+ C 0, (1, 4), (5, rdx)
+
+ pop s9
+ pop s8
+
+ add s0, s2
+ adc s1, s3
+ adc s5, sx
+ mov s2, 6*8(rp)
+ add s4, s3
+ adc %rdx, sx
+ C 3, x
+
+ mov 7*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b7
+ imul 1*8(ap), %rdx C L(a1 b7)
+ C 0, (1, rdx)
+
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s3
+ adc s1, sx
+ mov s3, 7*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()