diff --git a/.gitignore b/.gitignore
index f7fb28e8f7..ce3195baac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,5 @@ flint.pc
autom4te.cache/
config.m4
src/flint-mparam.h
+.gdb_history
+vgcore.*
diff --git a/Makefile.in b/Makefile.in
index b48e007b66..5a09cd6046 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -56,6 +56,7 @@ RM_F:=rm -f
RM_RF:=rm -rf
CP:=cp
CP_A:=cp -a
+GDB:=gdb
STATIC:=@STATIC@
SHARED:=@SHARED@
@@ -498,6 +499,16 @@ else
-include $(BUILD_DIR)/test/*.d
-include $(BUILD_DIR)/*/test/*.d
endif
+else ifeq ($(MAKECMDGOALS), debug)
+ifdef MOD
+$(warning Dependency tracking only set to cover the test executables of $(MOD).)
+-include $(foreach dir, $(MOD), $(BUILD_DIR)/$(dir)/test/*.d)
+else
+-include $(BUILD_DIR)/*/*.o.d
+-include $(BUILD_DIR)/*/*.lo.d
+-include $(BUILD_DIR)/test/*.d
+-include $(BUILD_DIR)/*/test/*.d
+endif
else ifeq ($(MAKECMDGOALS), tune)
-include $(BUILD_DIR)/*/*.o.d
-include $(BUILD_DIR)/*/*.lo.d
@@ -798,6 +809,25 @@ check: library $(TESTS:%=%_TEST_RUN)
@echo 'All tests passed.'
endif
+################################################################################
+# debugging
+################################################################################
+
+%_TEST_DBG_RUN_ARGS: %
+ @$(GDB) --args $< $(ARGS)
+
+ifdef MOD
+ifdef ARGS
+DEBUG:=1
+debug: library $(patsubst %,%_TEST_DBG_RUN_ARGS, $($(sort $(MOD))_TESTS))
+endif
+endif
+
+ifneq ($(DEBUG),1)
+debug:
+ $(error Can only run debugger with one module and one argument at a time)
+endif
+
################################################################################
# tuning
################################################################################
@@ -952,11 +982,11 @@ dist:
dev/make_dist.sh $(FLINT_VERSION)
################################################################################
-# debugging
+# makefile debugging
################################################################################
print-%:
@echo "$*=$($*)"
-.PHONY: all library shared static examples checkexamples profile tests check tune valgrind clean distclean install uninstall dist %_TEST_RUN %_VALGRIND_RUN print-% coverage coverage_html
-.SILENT: $(mpn_extras_S_SOURCES)
+.PHONY: all library shared static examples checkexamples profile tests check tune valgrind clean distclean install uninstall dist %_TEST_RUN %_TEST_RUN_% %_TEST_DBG_RUN_ARGS %_VALGRIND_RUN print-% coverage coverage_html debug
+.PRECIOUS: $(mpn_extras_PIC_S_SOURCES) $(mpn_extras_S_SOURCES)
diff --git a/src/mpn_extras.h b/src/mpn_extras.h
index 1c3a76f000..1d8420ccb1 100644
--- a/src/mpn_extras.h
+++ b/src/mpn_extras.h
@@ -370,6 +370,36 @@ flint_mpn_sqr(mp_ptr r, mp_srcptr x, mp_size_t n)
flint_mpn_mul((_z), (_y), (_yn), (_x), (_xn)); \
}
+/* Low multiplication ********************************************************/
+
+#define FLINT_HAVE_MULLOW_FUNC(n) ((n) <= FLINT_MPN_MULLOW_FUNC_TAB_WIDTH)
+
+FLINT_DLL extern const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[];
+
+mp_limb_t flint_mpn_mullow_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+
+#if FLINT_HAVE_ASSEMBLY_x86_64_adx
+# define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 8
+# define FLINT_HAVE_NATIVE_mpn_mullow_basecase 1
+#else
+# define FLINT_MPN_MULLOW_FUNC_TAB_WIDTH 0
+#endif
+
+/* TODO: Fix higher stuff */
+MPN_EXTRAS_INLINE
+mp_limb_t flint_mpn_mullow_n(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+ FLINT_ASSERT(n >= 1);
+
+ if (FLINT_HAVE_MULLOW_FUNC(n))
+ {
+ FLINT_ASSERT(rp != xp);
+ return flint_mpn_mullow_func_tab[n](rp, xp, yp);
+ }
+ else
+ return flint_mpn_mullow_basecase(rp, xp, yp, n);
+}
+
/* High multiplication *******************************************************/
#define FLINT_HAVE_MULHIGH_FUNC(n) ((n) <= FLINT_MPN_MULHIGH_FUNC_TAB_WIDTH)
diff --git a/src/mpn_extras/mullow.c b/src/mpn_extras/mullow.c
new file mode 100644
index 0000000000..266dfd0a44
--- /dev/null
+++ b/src/mpn_extras/mullow.c
@@ -0,0 +1,57 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+*/
+
+#include "mpn_extras.h"
+
+#if FLINT_HAVE_ASSEMBLY_x86_64_adx
+mp_limb_t flint_mpn_mullow_1(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_2(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_3(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_4(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_5(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_6(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_7(mp_ptr, mp_srcptr, mp_srcptr);
+mp_limb_t flint_mpn_mullow_8(mp_ptr, mp_srcptr, mp_srcptr);
+
+const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[] =
+{
+ NULL,
+ flint_mpn_mullow_1,
+ flint_mpn_mullow_2,
+ flint_mpn_mullow_3,
+ flint_mpn_mullow_4,
+ flint_mpn_mullow_5,
+ flint_mpn_mullow_6,
+ flint_mpn_mullow_7,
+ flint_mpn_mullow_8
+};
+#else
+const flint_mpn_mul_func_t flint_mpn_mullow_func_tab[] = { NULL };
+#endif
+
+#if !FLINT_HAVE_NATIVE_mpn_mullow_basecase
+mp_limb_t
+flint_mpn_mullow_basecase(mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
+{
+ mp_limb_t ret;
+ mp_size_t ix;
+
+ ret = mpn_mul_1(rp, xp, n, yp[0]);
+
+ for (ix = 1; ix < n; ix++)
+ {
+ ret += mpn_addmul_1(rp + ix, xp, n - ix, yp[ix]);
+ ret += xp[n - ix] * yp[ix];
+ }
+
+ return ret;
+}
+#endif
diff --git a/src/mpn_extras/profile/p-mullow.c b/src/mpn_extras/profile/p-mullow.c
new file mode 100644
index 0000000000..6a82befd61
--- /dev/null
+++ b/src/mpn_extras/profile/p-mullow.c
@@ -0,0 +1,74 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+*/
+
+#include "mpn_extras.h"
+#include "profiler.h"
+
+#define mpn_mullo_basecase __gmpn_mullo_basecase
+void mpn_mullo_basecase(mp_ptr, mp_srcptr, mp_srcptr, mp_size_t);
+
+#define N_MIN 1
+#define N_MAX 30
+
+#if N_MAX < 9
+# define P 1
+#elif N_MAX < 99
+# define P 2
+#elif N_MAX < 999
+# define P 3
+#elif N_MAX < 9999
+# define P 4
+#else
+# define P 5
+#endif
+
+#define _STR(x) #x
+#define STR(x) _STR(x)
+
+int
+main(void)
+{
+ mp_limb_t rp[2 * N_MAX];
+ mp_limb_t xp[N_MAX];
+ mp_limb_t yp[N_MAX];
+ mp_size_t n;
+
+ flint_printf("%.*s mullo_basecase / FLINT || mul / mullow\n", P, " ");
+ for (n = N_MIN; n <= N_MAX; n++)
+ {
+ double t1, t2, t3, FLINT_SET_BUT_UNUSED(__);
+ flint_printf("n = %" STR(P) "wd:", n);
+
+ mpn_random2(xp, n);
+ mpn_random2(yp, n);
+
+ TIMEIT_START
+ mpn_mullo_basecase(rp, xp, yp, n);
+ TIMEIT_STOP_VALUES(__, t1)
+
+ TIMEIT_START
+ flint_mpn_mul_n(rp, xp, yp, n);
+ TIMEIT_STOP_VALUES(__, t2)
+
+ TIMEIT_START
+ flint_mpn_mullow_n(rp, xp, yp, n);
+ TIMEIT_STOP_VALUES(__, t3)
+
+ flint_printf(" %7.2fx || %7.2f\n", t1 / t3, t2 / t3);
+ }
+
+ flint_cleanup_master();
+
+ return 0;
+}
+
+#undef N_MIN
+#undef N_MAX
diff --git a/src/mpn_extras/test/main.c b/src/mpn_extras/test/main.c
index 0d8ab7c6ac..e28b927bd1 100644
--- a/src/mpn_extras/test/main.c
+++ b/src/mpn_extras/test/main.c
@@ -24,6 +24,7 @@
#include "t-mul.c"
#include "t-mul_n.c"
#include "t-mul_toom22.c"
+#include "t-mullow_n.c"
#include "t-mulhigh.c"
#include "t-mulhigh_normalised.c"
#include "t-mulmod_2expp1.c"
@@ -48,6 +49,7 @@ test_struct tests[] =
TEST_FUNCTION(flint_mpn_mul),
TEST_FUNCTION(flint_mpn_mul_n),
TEST_FUNCTION(flint_mpn_mul_toom22),
+ TEST_FUNCTION(flint_mpn_mullow_n),
TEST_FUNCTION(flint_mpn_mulhigh),
TEST_FUNCTION(flint_mpn_mulhigh_normalised),
TEST_FUNCTION(flint_mpn_mulmod_2expp1),
diff --git a/src/mpn_extras/test/t-mullow_n.c b/src/mpn_extras/test/t-mullow_n.c
new file mode 100644
index 0000000000..388fd330fb
--- /dev/null
+++ b/src/mpn_extras/test/t-mullow_n.c
@@ -0,0 +1,69 @@
+/*
+ Copyright (C) 2024 Albin Ahlbäck
+
+ This file is part of FLINT.
+
+ FLINT is free software: you can redistribute it and/or modify it under
+ the terms of the GNU Lesser General Public License (LGPL) as published
+ by the Free Software Foundation; either version 3 of the License, or
+    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+*/
+
+#include "test_helpers.h"
+#include "mpn_extras.h"
+
+#define N_MIN 1
+#define N_MAX 30
+
+TEST_FUNCTION_START(flint_mpn_mullow_n, state)
+{
+ slong ix;
+ int result;
+
+ mp_ptr rp, rpf, xp, yp;
+
+ for (ix = 0; ix < 100000 * flint_test_multiplier(); ix++)
+ {
+ mp_limb_t ret;
+ mp_size_t n;
+
+ n = N_MIN + n_randint(state, N_MAX - N_MIN + 1);
+
+ rp = flint_malloc(sizeof(mp_limb_t) * n);
+ rpf = flint_malloc(2 * sizeof(mp_limb_t) * n);
+ xp = flint_malloc(sizeof(mp_limb_t) * n);
+ yp = flint_malloc(sizeof(mp_limb_t) * n);
+
+ mpn_random2(xp, n);
+ mpn_random2(yp, n);
+
+        /* all four buffers are freed at the end of each iteration */
+
+
+ ret = flint_mpn_mullow_n(rp, xp, yp, n);
+ flint_mpn_mul_n(rpf, xp, yp, n);
+
+ result = (mpn_cmp(rp, rpf, n) == 0 && ret == rpf[n]);
+ if (!result)
+ TEST_FUNCTION_FAIL(
+ "ix = %wd\n"
+ "n = %wd\n"
+ "xp = %{ulong*}\n"
+ "yp = %{ulong*}\n"
+ "Exp ret: %{ulong}\n"
+ "Got ret: %{ulong}\n"
+ "Expected: %{ulong*}\n"
+ "Got: %{ulong*}\n",
+ ix, n, xp, n, yp, n, rpf[n], ret, rpf, n, rp, n);
+
+ flint_free(rp);
+ flint_free(rpf);
+ flint_free(xp);
+ flint_free(yp);
+ }
+
+ TEST_FUNCTION_END(state);
+}
+
+#undef N_MIN
+#undef N_MAX
diff --git a/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm b/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm
new file mode 100644
index 0000000000..3fa98bee51
--- /dev/null
+++ b/src/mpn_extras/x86_64/broadwell/mullow_basecase.asm
@@ -0,0 +1,484 @@
+dnl
+dnl Copyright 2017 Free Software Foundation, Inc.
+dnl Contributed to the GNU project by Torbjorn Granlund.
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+dnl
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`config.m4')
+
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+define(`n', `%rcx')
+
+define(`bp', `%r11')
+
+define(`jmpreg',`%r15')
+define(`nn', `%rbp')
+define(`mm', `%rbx')
+
+define(`s0', `%r8')
+define(`s1', `%r9')
+define(`s2', `%r10')
+define(`s3', `%r12')
+define(`s4', `%r13')
+define(`s5', `%r14')
+
+define(`sx', `%rax')
+
+dnl Scheme:
+dnl 0 1 2 3 4 5 6 7 8
+dnl 0 x x x x x x x x x
+dnl 1 x x x x x x x x l
+dnl 2 x x x x x x x l
+dnl 3 x x x x x x l
+dnl 4 x x x x x l
+dnl 5 x x x x l
+dnl 6 x x x l
+dnl 7 x x l
+dnl 8 x l
+
+dnl NOTE: Requires n > 8.
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(flint_mpn_mullow_basecase)
+ push mm
+ push nn
+ push s3
+ push s4
+ push s5
+ push jmpreg
+
+ lea -2(n), R32(nn)
+ lea 1*8(bp_param), bp C Prepare bp for addmul_1
+ mov 0*8(bp_param), %rdx C Load rdx for mul_1
+
+ mov R32(n), R32(sx)
+ shr $3, R32(n)
+ and $7, R32(sx) C clear OF, CF as side-effect
+ lea L(mtab)(%rip), s0
+ifdef(`PIC',
+` movslq (s0,sx,4), sx
+ lea (sx,s0), s0
+ jmp *s0
+',`
+ jmp *(s0,sx,8)
+')
+
+L(mf0): mulx 0*8(ap), s0, s2
+ lea 7*8(ap), ap
+ lea -1*8(rp), rp
+ lea L(f0)(%rip), jmpreg
+ jmp L(mb0)
+
+L(mf3): mulx 0*8(ap), s1, sx
+ lea 2*8(ap), ap
+ lea 2*8(rp), rp
+ inc R32(n)
+ lea L(f3)(%rip), jmpreg
+ jmp L(mb3)
+
+L(mf4): mulx 0*8(ap), s0, s2
+ lea 3*8(ap), ap
+ lea 3*8(rp), rp
+ inc R32(n)
+ lea L(f4)(%rip), jmpreg
+ jmp L(mb4)
+
+L(mf5): mulx 0*8(ap), s1, sx
+ lea 4*8(ap), ap
+ lea 4*8(rp), rp
+ inc R32(n)
+ lea L(f5)(%rip), jmpreg
+ jmp L(mb5)
+
+L(mf6): mulx 0*8(ap), s0, s2
+ lea 5*8(ap), ap
+ lea 5*8(rp), rp
+ inc R32(n)
+ lea L(f6)(%rip), jmpreg
+ jmp L(mb6)
+
+L(mf7): mulx 0*8(ap), s1, sx
+ lea 6*8(ap), ap
+ lea 6*8(rp), rp
+ inc R32(n)
+ lea L(f7)(%rip), jmpreg
+ jmp L(mb7)
+
+L(mf1): mulx 0*8(ap), s1, sx
+ lea L(f1)(%rip), jmpreg
+ jmp L(mb1)
+
+L(mf2): mulx 0*8(ap), s0, s2
+ lea 1*8(ap), ap
+ lea 1*8(rp), rp
+ lea L(f2)(%rip), jmpreg
+ mulx 0*8(ap), s1, sx
+
+ ALIGN(32)
+L(mtop):mov s0, -1*8(rp)
+ adc s2, s1
+L(mb1): mulx 1*8(ap), s0, s2
+ adc sx, s0
+ lea 8*8(ap), ap
+ mov s1, 0*8(rp)
+L(mb0): mov s0, 1*8(rp)
+ mulx -6*8(ap), s1, sx
+ lea 8*8(rp), rp
+ adc s2, s1
+L(mb7): mulx -5*8(ap), s0, s2
+ mov s1, -6*8(rp)
+ adc sx, s0
+L(mb6): mov s0, -5*8(rp)
+ mulx -4*8(ap), s1, sx
+ adc s2, s1
+L(mb5): mulx -3*8(ap), s0, s2
+ mov s1, -4*8(rp)
+ adc sx, s0
+L(mb4): mulx -2*8(ap), s1, sx
+ mov s0, -3*8(rp)
+ adc s2, s1
+L(mb3): mulx -1*8(ap), s0, s2
+ adc sx, s0
+ mov s1, -2*8(rp)
+ dec R32(n)
+ mulx 0*8(ap), s1, sx
+ jnz L(mtop)
+
+ lea 1*8(,nn,8), R32(mm)
+ mov s0, -1*8(rp)
+ adc s2, s1
+ mov 0*8(bp), %rdx
+ mov s1, 0*8(rp)
+ adc n, sx C n = 0
+ shr $3, R32(nn)
+ neg mm
+ or R32(nn), R32(n) C Reset n, clear OF and CF
+ lea 1*8(rp,mm), rp C Reset rp
+ lea (ap,mm), ap C Reset ap
+ jmp *jmpreg
+
+ nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;
+L(f7): mulx 0*8(ap), s0, s2
+ lea 5*8(ap), ap
+ lea -3*8(rp), rp
+ lea L(f6)(%rip), jmpreg
+ jmp L(b7)
+
+L(f6): mulx 0*8(ap), s1, s3
+ lea 4*8(ap), ap
+ lea -4*8(rp), rp
+ lea L(f5)(%rip), jmpreg
+ jmp L(b6)
+
+L(end): adox 0*8(rp), s1
+ mulx 1*8(ap), s0, s2 C Only s0 is used
+ lea 1*8(mm), mm
+ adox s3, sx
+ mov s1, 0*8(rp)
+ lea 1*8(bp), bp
+ adc s0, sx
+ lea (ap,mm), ap C Reset ap
+ mov 0*8(bp), %rdx
+ or R32(nn), R32(n) C Reset count, clear CF and OF
+ lea 1*8(rp,mm), rp C Reset rp
+ jmp *jmpreg
+
+L(f0): mulx 0*8(ap), s1, s3
+ lea -2*8(ap), ap
+ lea -2*8(rp), rp
+ lea L(f7)(%rip), jmpreg
+ jmp L(b0)
+
+L(f3): mulx 0*8(ap), s0, s2
+ lea 1*8(ap), ap
+ lea 1*8(rp), rp
+ mulx 0*8(ap), s1, s3
+ lea L(f2)(%rip), jmpreg
+
+ ALIGN(32)
+L(top): adox -1*8(rp), s0
+ adcx s2, s1
+ mov s0, -1*8(rp)
+ jrcxz L(end)
+L(b2): mulx 1*8(ap), s0, s2
+ adox 0*8(rp), s1
+ lea -1(n), R32(n)
+ mov s1, 0*8(rp)
+ adcx s3, s0
+L(b1): mulx 2*8(ap), s1, s3
+ adcx s2, s1
+ adox 1*8(rp), s0
+ mov s0, 1*8(rp)
+L(b0): mulx 3*8(ap), s0, s2
+ lea 8*8(ap), ap
+ adcx s3, s0
+ adox 2*8(rp), s1
+ mov s1, 2*8(rp)
+L(b7): mulx -4*8(ap), s1, s3
+ adox 3*8(rp), s0
+ adcx s2, s1
+ mov s0, 3*8(rp)
+L(b6): mulx -3*8(ap), s0, s2
+ adcx s3, s0
+ adox 4*8(rp), s1
+ mov s1, 4*8(rp)
+L(b5): mulx -2*8(ap), s1, s3
+ adox 5*8(rp), s0
+ adcx s2, s1
+ mov s0, 5*8(rp)
+L(b4): adox 6*8(rp), s1
+ mulx -1*8(ap), s0, s2
+ mov s1, 6*8(rp)
+ lea 8*8(rp), rp
+ adcx s3, s0
+ mulx 0*8(ap), s1, s3
+ jmp L(top)
+
+L(f5): mulx 0*8(ap), s0, s2
+ lea 3*8(ap), ap
+ lea -5*8(rp), rp
+ lea L(f4)(%rip), jmpreg
+ jmp L(b5)
+
+L(f4): mulx 0*8(ap), s1, s3
+ lea 2*8(ap), ap
+ lea -6*8(rp), rp
+ lea L(f3)(%rip), jmpreg
+ jmp L(b4)
+
+L(f2): mulx 0*8(ap), s1, s3
+ lea -1(nn), R32(nn)
+ lea L(f1)(%rip), jmpreg
+ jmp L(b2)
+
+L(f1): mulx 0*8(ap), s0, s2
+ jrcxz L(cor)
+ lea -1*8(ap), ap
+ lea -1*8(rp), rp
+ lea L(f0)(%rip), jmpreg
+ jmp L(b1)
+
+define(`t0', `s0')
+define(`t1', `s2')
+define(`t2', `s1')
+define(`t3', `s3')
+define(`t4', `s4')
+define(`t5', `s5')
+define(`t6', `jmpreg')
+define(`t7', `mm')
+define(`t8', `nn')
+define(`t9', `n')
+define(`tx', `sx')
+L(cor): mulx 1*8(ap), t2, t3
+ mulx 2*8(ap), t4, t5
+ adcx 0*8(rp), t0
+ adox t1, t2
+ mulx 3*8(ap), t6, t7
+ adcx 1*8(rp), t2
+ adox t3, t4
+ mulx 4*8(ap), t8, t9
+ adcx 2*8(rp), t4
+ adox t5, t6
+ mulx 5*8(ap), t1, t3
+ adcx 3*8(rp), t6
+ adox t7, t8
+ mulx 6*8(ap), t5, t7
+ mov t0, 0*8(rp)
+ adcx 4*8(rp), t8
+ adox t9, t1
+ mulx 7*8(ap), t0, t9
+ adcx 5*8(rp), t1
+ adox t3, t5
+ mulx 8*8(ap), t3, %rdx C %rdx unused
+ adcx 6*8(rp), t5
+ adox t7, t0
+ adcx 7*8(rp), t0
+ adox t9, tx
+ C 2, 4, 6, 8, 1, 5, 0, x
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), t7, t9
+ adc t3, tx
+ test %al, %al C Reset OF and CF
+ adcx t7, t2
+ adox t9, t4
+ mov t2, 1*8(rp)
+ mulx 1*8(ap), t7, t9
+ mulx 2*8(ap), t2, t3
+ adcx t7, t4
+ adox t9, t6
+ mulx 3*8(ap), t7, t9
+ adcx t2, t6
+ adox t3, t8
+ mulx 4*8(ap), t2, t3
+ adcx t7, t8
+ adox t9, t1
+ mulx 5*8(ap), t7, t9
+ adcx t2, t1
+ adox t3, t5
+ mulx 6*8(ap), t2, t3
+ adcx t7, t5
+ adox t9, t0
+ mulx 7*8(ap), t7, t9 C t9 unused
+ adcx t2, t0
+ adox t3, tx
+ C 4, 6, 8, 1, 5, 0, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), t2, t3
+ adc t7, tx
+ test %al, %al
+ mulx 1*8(ap), t7, t9
+ adcx t2, t4
+ adox t3, t6
+ mov t4, 2*8(rp)
+ mulx 2*8(ap), t2, t3
+ adcx t7, t6
+ adox t9, t8
+ mulx 3*8(ap), t7, t9
+ adcx t2, t8
+ adox t3, t1
+ mulx 4*8(ap), t2, t3
+ adcx t7, t1
+ adox t9, t5
+ mulx 5*8(ap), t7, t9
+ adcx t2, t5
+ adox t3, t0
+ mulx 6*8(ap), t2, t3 C t3 unused
+ adcx t7, t0
+ adox t9, tx
+ C 6, 8, 1, 5, 0, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), t4, t7
+ adc t2, tx
+ test %al, %al
+ mulx 1*8(ap), t2, t3
+ adcx t4, t6
+ adox t7, t8
+ mov t6, 3*8(rp)
+ mulx 2*8(ap), t4, t7
+ mulx 3*8(ap), t6, t9
+ adcx t2, t8
+ adox t3, t1
+ mulx 4*8(ap), t2, t3
+ adcx t4, t1
+ adox t7, t5
+ mulx 5*8(ap), t4, t7 C t7 unused
+ adcx t6, t5
+ adox t9, t0
+ adcx t2, t0
+ adox t3, tx
+ C 8, 1, 5, 0, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), t2, t3
+ adc t4, tx
+ test %al, %al
+ mulx 1*8(ap), t4, t6
+ mulx 2*8(ap), t7, t9
+ adcx t2, t8
+ adox t3, t1
+ mulx 3*8(ap), t2, t3
+ adcx t4, t1
+ adox t6, t5
+ mulx 4*8(ap), t4, t6 C t6 unused
+ adcx t7, t5
+ adox t9, t0
+ mov t8, 4*8(rp)
+ adox t3, tx
+ adc t2, t0
+ C 1, 5, 0, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), t2, t3
+ adc t4, tx
+ mulx 1*8(ap), t4, t6
+ mulx 2*8(ap), t7, t8
+ imul 3*8(ap), %rdx
+ add t2, t1
+ adc t3, t5
+ mov t1, 5*8(rp)
+ adc t6, t0
+ adc t8, tx
+ add t4, t5
+ adc t7, t0
+ adc %rdx, tx
+ C 5, 0, x
+
+ mov 6*8(bp), %rdx
+ mulx 0*8(ap), t1, t2
+ mulx 1*8(ap), t3, t4
+ imul 2*8(ap), %rdx
+ add t1, t5
+ adc t2, t0
+ mov t5, 6*8(rp)
+ adc t4, tx
+ add t3, t0
+ adc %rdx, tx
+ C 0, x
+
+ mov 7*8(bp), %rdx
+ mulx 0*8(ap), t1, t2
+ imul 1*8(ap), %rdx
+
+ pop jmpreg
+ pop s5
+ pop s4
+ pop s3
+ pop nn
+ pop mm
+
+ add t1, t0
+ adc t2, tx
+ mov t0, 7*8(rp)
+ add %rdx, tx
+
+ ret
+EPILOGUE()
+ JUMPTABSECT
+ ALIGN(8)
+L(mtab):JMPENT( L(mf0), L(mtab))
+ JMPENT( L(mf1), L(mtab))
+ JMPENT( L(mf2), L(mtab))
+ JMPENT( L(mf3), L(mtab))
+ JMPENT( L(mf4), L(mtab))
+ JMPENT( L(mf5), L(mtab))
+ JMPENT( L(mf6), L(mtab))
+ JMPENT( L(mf7), L(mtab))
diff --git a/src/mpn_extras/x86_64/broadwell/mullow_hard.asm b/src/mpn_extras/x86_64/broadwell/mullow_hard.asm
new file mode 100644
index 0000000000..24d0e92693
--- /dev/null
+++ b/src/mpn_extras/x86_64/broadwell/mullow_hard.asm
@@ -0,0 +1,744 @@
+dnl
+dnl Copyright (C) 2024 Albin Ahlbäck
+dnl
+dnl This file is part of FLINT.
+dnl
+dnl FLINT is free software: you can redistribute it and/or modify it under
+dnl the terms of the GNU Lesser General Public License (LGPL) as published
+dnl by the Free Software Foundation; either version 3 of the License, or
+dnl    (at your option) any later version. See <https://www.gnu.org/licenses/>.
+dnl
+
+include(`config.m4')
+
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+
+define(`bp', `%r8')
+
+define(`s0', `%rcx')
+define(`s1', `%r9')
+define(`s2', `%r10')
+define(`s3', `%r11')
+define(`s4', `%rbx')
+define(`s5', `%rbp')
+define(`s6', `%r12')
+define(`s7', `%r13')
+define(`s8', `%r14')
+define(`s9', `%r15')
+
+define(`sx', `%rax')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_1)
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, sx C a0 b0
+ mov s0, 0*8(rp)
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_2)
+ mov 1*8(bp_param), sx
+ mov 0*8(bp_param), %rdx
+
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mov sx, %rdx
+ mulx 0*8(ap), bp_param, bp C a0 b1
+ imul 1*8(ap), sx C L(a1 b1)
+ C s0, (s1, s2, bp_param), (s3, bp, sx)
+
+ mov s0, 0*8(rp)
+
+ add s2, s1
+ adc s3, bp
+ C (s1, bp_param), (bp, sx)
+
+ add bp_param, s1
+ adc bp, sx
+ C s1, sx
+
+ mov s1, 1*8(rp)
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_3)
+ push s4
+ push s5
+
+ mov bp_param, bp
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, sx C a2 b0
+ C 0, (1, 2), (3, 4), x
+
+ mov s0, 0*8(rp)
+ add s2, s1
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b1
+ adc s4, s3
+ adc $0, sx
+ mulx 1*8(ap), s4, s5 C a1 b1
+ imul 2*8(ap), %rdx C L(a2 b1)
+ C 0, (2, 4), (5, rdx)
+
+ add s0, s1
+ adc s2, s3
+ mov s1, 1*8(rp)
+ adc %rdx, sx
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b2
+ imul 1*8(ap), %rdx C L(a1 b2)
+ C 0, (2, rdx)
+
+ add s4, s3
+ adc s5, sx
+
+ pop s5
+ pop s4
+
+ add s2, sx
+ add s0, s3
+ adc %rdx, sx
+ mov s3, 2*8(rp)
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_4)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, sx C a3 b0
+ C 0, (1, 2), (3, 4), (5, 6), x
+
+ mov s0, 0*8(rp)
+ add s2, s1
+ adc s4, s3
+ adc s6, s5
+ adc $0, sx
+ C 1, 3, 5, x
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b1
+ mulx 1*8(ap), s4, s6 C a1 b1
+ mulx 2*8(ap), s7, s8 C a2 b1
+ imul 3*8(ap), %rdx C L(a3 b1)
+ C 0, (2, 4), (6, 7), (8, rdx)
+
+ add s0, s1
+ adc s2, s3
+ adc s6, s5
+ adc s8, sx
+ mov s1, 1*8(rp)
+ add s4, s3
+ adc s7, s5
+ adc %rdx, sx
+ C 3, 5, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s2, s4 C a1 b2
+ imul 2*8(ap), %rdx C L(a2 b2)
+ C 0, (1, 2), (4, rdx)
+
+ add s0, s3
+ adc s1, s5
+ mov s3, 2*8(rp)
+ adc s4, sx
+ add s5, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ imul 1*8(ap), %rdx C L(a1 b3)
+ C 0, (1, rdx)
+
+ pop s8
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 3*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_5)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mulx 4*8(ap), s8, sx C a4 b0
+ C 0, (1, 2), (3, 4), (5, 6), (7, 8), x
+
+ mov s0, 0*8(rp)
+ add s2, s1
+ adc s4, s3
+ adc s6, s5
+ adc s8, s7
+ adc $0, sx
+ C 1, 3, 5, 7, x
+
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s0, s2 C a0 b1
+ mulx 1*8(ap), s4, s6 C a1 b1
+ mulx 2*8(ap), s8, s9 C a2 b1
+ add s0, s1
+ adc s2, s3
+ mov s1, 1*8(rp)
+ mulx 3*8(ap), s0, s2 C a3 b1
+ mulx 4*8(ap), %rdx, s1 C a4 b1, but we only use %rdx
+ C -, (-, 4), (6, 8), (9, 0), (2, %rdx)
+
+ adc s6, s5
+ adc s9, s7
+ adc s2, sx
+ add s4, s3
+ adc s8, s5
+ adc s0, s7
+ adc %rdx, sx
+ C 3, 5, 7, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s2, s4 C a1 b2
+ mulx 2*8(ap), s6, s8 C a2 b2
+ imul 3*8(ap), %rdx C L(a3 b2)
+ C 0, (1, 2), (4, 6), (8, rdx)
+
+ add s0, s3
+ adc s1, s5
+ adc s4, s7
+ adc s8, sx
+ add s2, s5
+ adc s6, s7
+ adc %rdx, sx
+ mov s3, 2*8(rp)
+ C 5, 7, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s2, s3 C a1 b3
+ imul 2*8(ap), %rdx C L(a2 b3)
+ C 0, (1, 2), (3, rdx)
+
+ add s0, s5
+ adc s1, s7
+ mov s5, 3*8(rp)
+ adc s3, sx
+ add s7, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ imul 1*8(ap), %rdx C L(a1 b4)
+ C 0, (1, rdx)
+
+ pop s9
+ pop s8
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 4*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_6)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mulx 4*8(ap), s8, s9 C a4 b0
+ mulx 5*8(ap), %rdx, sx C a5 b0
+ C 0, (1, 2), (3, 4), (5, 6), (7, 8), (9, rdx), x
+
+ add s2, s1
+ adc s4, s3
+ adc s6, s5
+ mov s0, 0*8(rp)
+ adc s8, s7
+ adc %rdx, s9
+ adc $0, sx
+ C 1, 3, 5, 7, 9, x
+
+ test %al, %al
+ mov 1*8(bp), %rdx
+ mulx 0*8(ap), s2, s4 C a0 b1
+ mulx 1*8(ap), s6, s8 C a1 b1
+ adcx s2, s1
+ adox s4, s3
+ mov s1, 1*8(rp)
+ mulx 2*8(ap), s2, s4 C a2 b1
+ adcx s6, s3
+ adox s8, s5
+ mulx 3*8(ap), s6, s8 C a3 b1
+ adcx s2, s5
+ adox s4, s7
+ mulx 4*8(ap), s2, s4 C a4 b1
+ adcx s6, s7
+ adox s8, s9
+ mulx 5*8(ap), %rdx, s6 C a5 b1, but we only use %rdx
+ C -, (-, -), (-, -), (-, -), (-, 2), (4, %rdx)
+
+ adcx s2, s9
+ adox s4, sx
+ adc %rdx, sx
+ C 3, 5, 7, 9, x
+
+ mov 2*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s2, s4 C a1 b2
+ mulx 2*8(ap), s6, s8 C a2 b2
+ add s0, s3
+ adc s1, s5
+ mov s3, 2*8(rp)
+ mulx 3*8(ap), s0, s1 C a3 b2
+ mulx 4*8(ap), %rdx, s3 C a4 b2, but we only use %rdx
+ C -, (-, 2), (4, 6), (8, 0), (1, rdx)
+
+ adc s4, s7
+ adc s8, s9
+ adc s1, sx
+ add s2, s5
+ adc s6, s7
+ adc s0, s9
+ adc %rdx, sx
+ C 5, 7, 9, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s2, s3 C a1 b3
+ mulx 2*8(ap), s4, s6 C a2 b3
+ imul 3*8(ap), %rdx C L(a3 b3)
+ C 0, (1, 2), (3, 4), (6, rdx)
+
+ add s0, s5
+ adc s1, s7
+ adc s3, s9
+ mov s5, 3*8(rp)
+ adc s6, sx
+ add s2, s7
+ adc s4, s9
+ adc %rdx, sx
+ C 7, 9, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ mulx 1*8(ap), s2, s3 C a1 b4
+ imul 2*8(ap), %rdx C L(a2 b4)
+ C 0, (1, 2), (3, rdx)
+
+ add s0, s7
+ adc s1, s9
+ adc s3, sx
+ mov s7, 4*8(rp)
+ add s9, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b5
+ imul 1*8(ap), %rdx C L(a1 b5)
+ C 0, (1, rdx)
+
+ pop s9
+ pop s8
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 5*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_7)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mov s0, 0*8(rp)
+ mulx 4*8(ap), s8, s9 C a4 b0
+ add s2, s1
+ adc s4, s3
+ mulx 5*8(ap), s0, s2 C a5 b0
+ adc s6, s5
+ adc s8, s7
+ mulx 6*8(ap), s4, sx C a6 b0
+ C 0, (1, -), (3, -), (5, -), (7, -), (9, 0), (2, 4), x
+
+ adc s0, s9
+ adc s4, s2
+ mov 1*8(bp), %rdx
+ adc $0, sx
+ C 1, 3, 5, 7, 9, 2, x
+
+ test %al, %al
+ mulx 0*8(ap), s0, s4 C a0 b1
+ mulx 1*8(ap), s6, s8 C a1 b1
+ adcx s0, s1
+ adox s4, s3
+ mulx 2*8(ap), s0, s4 C a2 b1
+ adcx s6, s3
+ adox s8, s5
+ mulx 3*8(ap), s6, s8 C a3 b1
+ adcx s0, s5
+ adox s4, s7
+ mulx 4*8(ap), s0, s4 C a4 b1
+ adcx s6, s7
+ adox s8, s9
+ mulx 5*8(ap), s6, s8 C a5 b1
+ adcx s0, s9
+ adox s4, s2
+ mulx 6*8(ap), %rdx, s0 C a6 b1, but we only use %rdx
+ adcx s6, s2
+ adox s8, sx
+ mov s1, 1*8(rp)
+ adc %rdx, sx
+ C 3, 5, 7, 9, 2, x
+
+ mov 2*8(bp), %rdx
+ test %al, %al
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s4, s6 C a1 b2
+ adcx s0, s3
+ adox s1, s5
+ mov s3, 2*8(rp)
+ mulx 2*8(ap), s0, s1 C a2 b2
+ adcx s4, s5
+ adox s6, s7
+ mulx 3*8(ap), s4, s6 C a3 b2
+ adcx s0, s7
+ adox s1, s9
+ mulx 4*8(ap), s0, s1 C a4 b2
+ adcx s4, s9
+ adox s6, s2
+ mulx 5*8(ap), %rdx, s3 C a5 b2, but we only use %rdx
+ adcx s0, s2
+ adox s1, sx
+ adc %rdx, sx
+ C 5, 7, 9, 2, x
+
+ mov 3*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s3, s4 C a1 b3
+ mulx 2*8(ap), s6, s8 C a2 b3
+ add s0, s5
+ adc s3, s1
+ adc $0, s4
+ mulx 3*8(ap), s0, s3 C a3 b3
+ imul 4*8(ap), %rdx C L(a4 b3)
+ C -, (1, -), (4, 6), (8, 0), (3, rdx)
+
+ mov s5, 3*8(rp)
+ add s1, s7
+ adc s4, s9
+ adc s8, s2
+ adc s3, sx
+ add s6, s9
+ adc s0, s2
+ adc %rdx, sx
+ C 7, 9, 2, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ mulx 1*8(ap), s3, s4 C a1 b4
+ mulx 2*8(ap), s5, s6 C a2 b4
+ imul 3*8(ap), %rdx C L(a3 b4)
+ C 0, (1, 3), (4, 5), (6, rdx)
+
+ add s0, s7
+ adc s1, s9
+ adc s4, s2
+ mov s7, 4*8(rp)
+ adc s6, sx
+ add s9, s3
+ adc s5, s2
+ adc %rdx, sx
+ C 3, 2, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b5
+ mulx 1*8(ap), s4, s5 C a1 b5
+ imul 2*8(ap), %rdx C L(a2 b5)
+ C 0, (1, 4), (5, rdx)
+
+ pop s9
+ pop s8
+
+ add s0, s3
+ adc s1, s2
+ adc s5, sx
+ mov s3, 5*8(rp)
+ add s4, s2
+ adc %rdx, sx
+ C 2, x
+
+ mov 6*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b6
+ imul 1*8(ap), %rdx C L(a1 b6)
+ C 0, (1, rdx)
+
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s2
+ adc s1, sx
+ mov s2, 6*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(flint_mpn_mullow_8)
+ mov bp_param, bp
+
+ push s4
+ push s5
+ push s6
+ push s7
+ push s8
+ push s9
+
+ mov 0*8(bp_param), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b0
+ mulx 1*8(ap), s2, s3 C a1 b0
+ mulx 2*8(ap), s4, s5 C a2 b0
+ mulx 3*8(ap), s6, s7 C a3 b0
+ mov s0, 0*8(rp)
+ mulx 4*8(ap), s8, s9 C a4 b0
+ add s2, s1
+ adc s4, s3
+ mulx 5*8(ap), s0, s2 C a5 b0
+ adc s6, s5
+ adc s8, s7
+ mulx 6*8(ap), s4, s6 C a6 b0
+ mulx 7*8(ap), s8, sx C a7 b0
+ C -, (1, -), (3, -), (5, -), (7, -), (9, 0), (2, 4), (6, 8), x
+
+ adc s0, s9
+ adc s4, s2
+ adc s8, s6
+ mov 1*8(bp), %rdx
+ adc $0, sx
+ C 1, 3, 5, 7, 9, 2, 6, x
+
+ test %al, %al
+ mulx 0*8(ap), s0, s4 C a0 b1
+ adcx s0, s1
+ adox s4, s3
+ mov s1, 1*8(rp)
+ mulx 1*8(ap), s8, s0 C a1 b1
+ mulx 2*8(ap), s4, s1 C a2 b1
+ adcx s8, s3
+ adox s0, s5
+ mulx 3*8(ap), s8, s0 C a3 b1
+ adcx s4, s5
+ adox s1, s7
+ mulx 4*8(ap), s4, s1 C a4 b1
+ adcx s8, s7
+ adox s0, s9
+ mulx 5*8(ap), s8, s0 C a5 b1
+ adcx s4, s9
+ adox s1, s2
+ mulx 6*8(ap), s4, s1 C a6 b1
+ adcx s8, s2
+ adox s0, s6
+ mulx 7*8(ap), s8, s0 C a7 b1, but we only use s8
+ adcx s4, s6
+ adox s1, sx
+ adc s8, sx
+ C 3, 5, 7, 9, 2, 6, x
+
+ mov 2*8(bp), %rdx
+ test %al, %al
+ mulx 0*8(ap), s0, s1 C a0 b2
+ mulx 1*8(ap), s4, s8 C a1 b2
+ adcx s0, s3
+ adox s1, s5
+ mov s3, 2*8(rp)
+ mulx 2*8(ap), s0, s1 C a2 b2
+ adcx s4, s5
+ adox s8, s7
+ mulx 3*8(ap), s4, s8 C a3 b2
+ adcx s0, s7
+ adox s1, s9
+ mulx 4*8(ap), s0, s1 C a4 b2
+ adcx s4, s9
+ adox s8, s2
+ mulx 5*8(ap), s4, s8 C a5 b2
+ adcx s0, s2
+ adox s1, s6
+ mulx 6*8(ap), s0, s1 C a6 b2, but we only use s0
+ adcx s4, s6
+ adox s8, sx
+ adc s0, sx
+ C 5, 7, 9, 2, 6, x
+
+ mov 3*8(bp), %rdx
+ test %al, %al
+ mulx 0*8(ap), s0, s1 C a0 b3
+ mulx 1*8(ap), s3, s4 C a1 b3
+ adcx s0, s5
+ adox s1, s7
+ mulx 2*8(ap), s0, s1 C a2 b3
+ adcx s3, s7
+ adox s4, s9
+ mulx 3*8(ap), s3, s4 C a3 b3
+ adcx s0, s9
+ adox s1, s2
+ mulx 4*8(ap), s0, s1 C a4 b3
+ adcx s3, s2
+ adox s4, s6
+ mulx 5*8(ap), s3, s4 C a5 b3, but we only use s3
+ mov s5, 3*8(rp)
+ adcx s0, s6
+ adox s1, sx
+ adc s3, sx
+ C 7, 9, 2, 6, x
+
+ mov 4*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b4
+ mulx 1*8(ap), s3, s4 C a1 b4
+ mulx 2*8(ap), s5, s8 C a2 b4
+ add s0, s7
+ adc $0, s1
+ mov s7, 4*8(rp)
+ mulx 3*8(ap), s0, s7 C a3 b4
+ imul 4*8(ap), %rdx C L(a4 b4)
+ C -, (1, 3), (4, 5), (8, 0), (7, rdx)
+
+ add s1, s9
+ adc s4, s2
+ adc s8, s6
+ adc s7, sx
+ add s3, s9
+ adc s5, s2
+ adc s0, s6
+ adc %rdx, sx
+ C 9, 2, 6, x
+
+ mov 5*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b5
+ mulx 1*8(ap), s5, s4 C a1 b5
+ mulx 2*8(ap), s3, s7 C a2 b5
+ imul 3*8(ap), %rdx C L(a3 b5)
+ C 0, (1, 5), (4, 3), (7, rdx)
+
+ add s0, s9
+ adc s1, s2
+ adc s4, s6
+ adc s7, sx
+ mov s9, 5*8(rp)
+ add s5, s2
+ adc s6, s3
+ adc %rdx, sx
+ C 2, 3, x
+
+ mov 6*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b6
+ mulx 1*8(ap), s4, s5 C a1 b6
+ imul 2*8(ap), %rdx C L(a2 b6)
+ C 0, (1, 4), (5, rdx)
+
+ pop s9
+ pop s8
+
+ add s0, s2
+ adc s1, s3
+ adc s5, sx
+ mov s2, 6*8(rp)
+ add s4, s3
+ adc %rdx, sx
+ C 3, x
+
+ mov 7*8(bp), %rdx
+ mulx 0*8(ap), s0, s1 C a0 b7
+ imul 1*8(ap), %rdx C L(a1 b7)
+ C 0, (1, rdx)
+
+ pop s7
+ pop s6
+ pop s5
+ pop s4
+
+ add s0, s3
+ adc s1, sx
+ mov s3, 7*8(rp)
+ add %rdx, sx
+
+ ret
+EPILOGUE()