From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001
From: Thomas Voss <mail@thomasvoss.com>
Date: Fri, 21 Jun 2024 23:36:36 +0200
Subject: Basic constant folding implementation

---
 vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm | 153 ++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm        | 195 +++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm       | 217 ++++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm        | 179 +++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm   | 249 +++++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h        | 237 ++++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm    | 469 ++++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm  | 436 +++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm | 559 ++++++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm          | 591 +++++++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm    | 807 +++++++++++++++++++++
 11 files changed, 4092 insertions(+)
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm

(limited to 'vendor/gmp-6.3.0/mpn/x86_64/k8')

diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm
new file mode 100644
index 0000000..3e1898b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm
@@ -0,0 +1,153 @@
+dnl  AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63.
+
+dnl  Copyright 2008, 2021 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.167
+C AMD K10	 2.167
+C Intel P4	12.0
+C Intel core2	 4.0
+C Intel corei	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C TODO
+C  * Perhaps handle various n mod 3 sizes better.  The code now is too large.
+
+C INPUT PARAMETERS
+define(`rp',	`%rdi')	C destination limbs (stores go through rp)
+define(`ap',	`%rsi')	C first source vector, its limbs are multiplied by r8
+define(`bp_param', `%rdx')	C second source vector as passed in (rdx is overwritten by mul)
+define(`n',	`%rcx')	C limb count, negated below and used as a rising index
+define(`u0',	`%r8')	C multiplier for the ap limbs
+define(`v0',	`%r9')	C multiplier for the bp limbs
+
+
+define(`bp', `%rbp')	C working pointer for the second source vector
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addaddmul_1msb0)
+        FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	push	%rbp			C rbp serves as bp below
+
+	lea	(ap,n,8), ap		C ap, bp and rp now point just past their last limb
+	lea	(bp_param,n,8), bp
+	lea	(rp,n,8), rp
+	neg	n			C index runs from -n up towards zero
+
+	mov	(ap,n,8), %rax		C lowest ap limb
+	mul	%r8			C rdx:rax = ap limb * u0
+	mov	%rax, %r11		C r11 = low product limb
+	mov	(bp,n,8), %rax		C lowest bp limb, multiplied by v0 next
+	mov	%rdx, %r10		C r10 = high product limb
+	add	$3, n			C the loop consumes three limbs per pass
+	jns	L(end)			C three limbs or fewer: wind-down code only
+
+	push	%r13			C third accumulator, needed only inside the loop
+
+	ALIGN(16)
+L(top):	mul	%r9			C rdx:rax = bp limb * v0
+	add	%rax, %r11
+	mov	-16(ap,n,8), %rax
+	adc	%rdx, %r10
+	mov	%r11, -24(rp,n,8)	C store a finished result limb
+	mul	%r8
+	add	%rax, %r10
+	mov	-16(bp,n,8), %rax
+	mov	$0, R32(%r13)		C mov leaves CF from the add intact for the adc
+	adc	%rdx, %r13
+	mul	%r9
+	add	%rax, %r10
+	mov	-8(ap,n,8), %rax
+	adc	%rdx, %r13
+	mov	%r10, -16(rp,n,8)
+	mul	%r8
+	add	%rax, %r13
+	mov	-8(bp,n,8), %rax
+	mov	$0, R32(%r11)		C same trick: clear r11 without touching CF
+	adc	%rdx, %r11
+	mul	%r9
+	add	%rax, %r13
+	adc	%rdx, %r11
+	mov	(ap,n,8), %rax
+	mul	%r8
+	add	%rax, %r11
+	mov	%r13, -8(rp,n,8)
+	mov	(bp,n,8), %rax
+	mov	$0, R32(%r10)		C clear r10 without touching CF
+	adc	%rdx, %r10
+	add	$3, n
+	js	L(top)			C loop while the index is still negative
+
+	pop	%r13
+
+L(end):	mul	%r9
+	add	%rax, %r11
+	adc	%rdx, %r10
+	cmp	$1, R32(n)		C n is 0, 1 or 2 here for a limb count of at least 1
+	ja	L(two)			C n = 2: only the top limb remains to be stored
+	mov	-16(ap,n,8), %rax
+	mov	%r11, -24(rp,n,8)
+	mov	%r10, %r11
+	jz	L(one)			C n = 1: one more limb pair; flags are still from the cmp
+
+L(nul):	mul	%r8			C n = 0: two more limb pairs
+	add	%rax, %r10
+	mov	-16(bp), %rax
+	mov	$0, R32(%r11)		C clear r11 without touching CF
+	adc	%rdx, %r11
+	mul	%r9
+	add	%rax, %r10
+	mov	-8(ap), %rax
+	adc	%rdx, %r11
+	mov	%r10, -16(rp)
+L(one):	mul	%r8
+	add	%rax, %r11
+	mov	-8(bp), %rax
+	mov	$0, R32(%r10)		C clear r10 without touching CF
+	adc	%rdx, %r10
+	mul	%r9
+	add	%rax, %r11
+	adc	%rdx, %r10
+
+L(two):	mov	%r11, -8(rp)		C most significant result limb
+	mov	%r10, %rax		C return value: the limb carried out of the top
+L(ret):	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
new file mode 100644
index 0000000..78bcba1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
@@ -0,0 +1,195 @@
+dnl  AMD64 mpn_addmul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl  add the result to a third limb vector.
+
+dnl  Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb cfg	cycles/limb am1+am1
+C AMD K8,K9	 2.375
+C AMD K10	 2.375
+C AMD bull	 5.2		<-		4.6-4.75		bad
+C AMD pile	 4.96		<-		4.6-4.75		bad
+C AMD steam	 ?
+C AMD excavator	 ?
+C AMD bobcat	 5.75				5.0			bad
+C AMD jaguar	 5.9				5.2-5.4			bad
+C Intel P4	15-16
+C Intel core2	 4.5				4.25-4.5		bad
+C Intel NHM	 4.33				4.55			bad
+C Intel SBR	 3.4		 2.93		3.24			bad
+C Intel IBR	 3.35		 2.6		2.95			bad
+C Intel HWL	 3.3		 2.15		2.3			bad
+C Intel BWL	 2.33		 2.33		1.65			bad
+C Intel SKL	 2.37		 2.21		1.64			bad
+C Intel atom	20		18.7
+C Intel SLM	 8		 8.5
+C VIA nano	 4.4
+
+C This code is the result of running a code generation and optimization tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Tune feed-in and wind-down code.
+
+C INPUT PARAMETERS
+define(`rp',     `%rdi')	C destination, read and updated in place
+define(`up',     `%rsi')	C source limb vector
+define(`n_param',`%rdx')	C limb count as passed in; copied to n because mul writes rdx
+define(`vp',     `%rcx')	C 2-limb multiplier, read only in the prologue
+
+define(`v0', `%r8')	C first limb loaded from vp
+define(`v1', `%r9')	C second limb loaded from vp
+define(`w0', `%rbx')	C w0-w3 are the rotating accumulator limbs
+define(`w1', `%rcx')	C reuses rcx once vp has been read
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')	C negated limb index
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+	FUNC_ENTRY(4)
+	mov	n_param, n		C free rdx before the first mul
+	push	%rbx
+	push	%rbp
+
+	mov	0(vp), v0
+	mov	8(vp), v1		C vp (rcx) is not read again after this
+
+	mov	R32(n_param), R32(%rbx)	C low bits of n select the loop entry point
+	mov	(up), %rax
+	lea	-8(up,n_param,8), up	C up and rp now point at their last limb
+	lea	-8(rp,n_param,8), rp
+	mul	v0			C rdx:rax = lowest up limb * v0
+	neg	n
+	and	$3, R32(%rbx)		C n mod 4
+	jz	L(b0)
+	cmp	$2, R32(%rbx)
+	jc	L(b1)			C n mod 4 = 1
+	jz	L(b2)			C n mod 4 = 2
+
+L(b3):	mov	%rax, w1		C n mod 4 = 3
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	mov	8(up,n,8), %rax
+	dec	n
+	jmp	L(lo3)
+
+L(b2):	mov	%rax, w2
+	mov	8(up,n,8), %rax
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	add	$-2, n
+	jmp	L(lo2)
+
+L(b1):	mov	%rax, w3
+	mov	8(up,n,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	inc	n
+	jmp	L(lo1)
+
+L(b0):	mov	$0, R32(w3)		C n mod 4 = 0
+	mov	%rax, w0
+	mov	8(up,n,8), %rax
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	jmp	L(lo0)
+
+	ALIGN(32)
+L(top):	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	(up,n,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(lo1):	mul	v1			C entry point used by L(b1)
+	add	w3, (rp,n,8)		C fold a finished limb into the destination
+	mov	$0, R32(w3)		C mov, not xor: CF must reach the adc below
+	adc	%rax, w0
+	mov	$0, R32(w2)		C likewise keeps CF for the next adc
+	mov	8(up,n,8), %rax
+	adc	%rdx, w1
+	mul	v0
+	add	%rax, w0
+	mov	8(up,n,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(lo0):	mul	v1			C entry point used by L(b0)
+	add	w0, 8(rp,n,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+	mov	16(up,n,8), %rax
+	mul	v0
+	add	%rax, w1
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mov	16(up,n,8), %rax
+L(lo3):	mul	v1			C entry point used by L(b3)
+	add	w1, 16(rp,n,8)
+	adc	%rax, w2
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	mov	24(up,n,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	24(up,n,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+L(lo2):	mul	v1			C entry point used by L(b2)
+	add	w2, 24(rp,n,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	32(up,n,8), %rax
+	add	$4, n			C four limbs per pass
+	js	L(top)
+
+L(end):	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	(up), %rax		C last limb of up
+	adc	%rdx, w0
+	adc	R32(w1), R32(w1)	C w1 was zero, so this sets w1 = CF
+	mul	v1
+	add	w3, (rp)		C last limb that is accumulated in place
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(rp)		C the limb above it is stored, not accumulated
+	mov	w1, %rax		C return the top limb
+
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
new file mode 100644
index 0000000..ff3a184
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
@@ -0,0 +1,217 @@
+dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.
+
+dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.87	< 3.85 for lshift + add_n
+C AMD K10	 2.75	< 3.85 for lshift + add_n
+C Intel P4	22	> 7.33 for lshift + add_n
+C Intel core2	 4.1	> 3.27 for lshift + add_n
+C Intel NHM	 4.4	> 3.75 for lshift + add_n
+C Intel SBR	 3.17	< 3.46 for lshift + add_n
+C Intel atom	 ?	? 8.75 for lshift + add_n
+C VIA nano	 4.7	< 6.25 for lshift + add_n
+
+C TODO
+C  * Can we propagate carry into rdx instead of using a special carry register?
+C    That could save enough insns to get to 10 cycles/iteration.
+
+define(`rp',       `%rdi')	C destination limbs
+define(`up',       `%rsi')	C operand combined through ADDSUB/ADCSBB
+define(`vp_param', `%rdx')	C operand that gets shifted, as passed in
+define(`n_param',  `%rcx')	C limb count as passed in
+define(`cnt',      `%r8')	C shift count
+
+define(`vp',    `%r12')	C working pointer into the shifted operand
+define(`n',     `%rbp')	C negated limb index
+
+ifdef(`OPERATION_addlsh_n',`
+  define(ADDSUB,       `add')
+  define(ADCSBB,       `adc')
+  define(func, mpn_addlsh_n)
+')	C add variant: add/adc carry chain
+ifdef(`OPERATION_rsblsh_n',`
+  define(ADDSUB,       `sub')
+  define(ADCSBB,       `sbb')
+  define(func, mpn_rsblsh_n)
+')	C reverse-subtract variant: sub/sbb borrow chain
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%r12
+	push	%rbp
+	push	%rbx
+
+	mov	(vp_param), %rax	C load first V limb early
+
+	mov	$0, R32(n)
+	sub	n_param, n		C n = -n_param
+
+	lea	-16(up,n_param,8), up
+	lea	-16(rp,n_param,8), rp
+	lea	16(vp_param,n_param,8), vp
+
+	mov	n_param, %r9		C copy for the n mod 4 dispatch; rcx is reused next
+
+	mov	%r8, %rcx		C the shift count has to be in cl
+	mov	$1, R32(%r8)
+	shl	R8(%rcx), %r8		C r8 = 2^cnt, so each mul below acts as a left shift
+
+	mul	%r8			C initial multiply
+
+	and	$3, R32(%r9)		C n mod 4
+	jz	L(b0)
+	cmp	$2, R32(%r9)
+	jc	L(b1)			C n mod 4 = 1
+	jz	L(b2)			C n mod 4 = 2
+
+L(b3):	mov	%rax, %r11		C n mod 4 = 3
+	ADDSUB	16(up,n,8), %r11
+	mov	-8(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)	C park carry/borrow in rcx as 0 or -1
+	mov	%rdx, %rbx		C bits shifted out of the previous limb
+	mul	%r8
+	or	%rax, %rbx		C merge them with the shifted next limb
+	mov	(vp,n,8), %rax
+	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	add	$3, n
+	jnz	L(lo3)
+	jmp	L(cj3)
+
+L(b2):	mov	%rax, %rbx
+	mov	-8(vp,n,8), %rax
+	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	add	$2, n
+	jz	L(cj2)
+	mov	%rdx, %r10
+	mov	-16(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r10
+	xor	R32(%rcx), R32(%rcx)	C clear carry register
+	jmp	L(lo2)
+
+L(b1):	mov	%rax, %r9
+	mov	%rdx, %r10
+	add	$1, n
+	jnz	L(gt1)			C more than one limb
+	ADDSUB	8(up,n,8), %r9
+	jmp	L(cj1)
+L(gt1):	mov	-16(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r10
+	mov	%rdx, %r11
+	mov	-8(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r11
+	ADDSUB	8(up,n,8), %r9
+	ADCSBB	16(up,n,8), %r10
+	ADCSBB	24(up,n,8), %r11
+	mov	(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)	C park carry/borrow in rcx as 0 or -1
+	jmp	L(lo1)
+
+L(b0):	mov	%rax, %r10		C n mod 4 = 0
+	mov	%rdx, %r11
+	mov	-8(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r11
+	ADDSUB	16(up,n,8), %r10
+	ADCSBB	24(up,n,8), %r11
+	mov	(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)	C park carry/borrow in rcx as 0 or -1
+	mov	%rdx, %rbx
+	mul	%r8
+	or	%rax, %rbx
+	mov	8(vp,n,8), %rax
+	add	$4, n
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	mov	%r10, -16(rp,n,8)
+L(lo3):	mov	%rdx, %r10
+	mov	-16(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r10
+	mov	%r11, -8(rp,n,8)
+L(lo2):	mov	%rdx, %r11
+	mov	-8(vp,n,8), %rax
+	mul	%r8
+	or	%rax, %r11
+	add	R32(%rcx), R32(%rcx)	C bring the parked carry/borrow back into CF
+	ADCSBB	(up,n,8), %rbx
+	ADCSBB	8(up,n,8), %r9
+	ADCSBB	16(up,n,8), %r10
+	ADCSBB	24(up,n,8), %r11
+	mov	(vp,n,8), %rax
+	sbb	R32(%rcx), R32(%rcx)	C park it again; mul, or and add overwrite the flags
+	mov	%rbx, (rp,n,8)
+L(lo1):	mov	%rdx, %rbx
+	mul	%r8
+	or	%rax, %rbx
+	mov	%r9, 8(rp,n,8)
+L(lo0):	mov	8(vp,n,8), %rax
+	add	$4, n			C four limbs per pass
+	jnz	L(top)
+
+L(end):	mov	%rdx, %r9
+	mul	%r8
+	or	%rax, %r9
+	mov	%r10, -16(rp,n,8)
+L(cj3):	mov	%r11, -8(rp,n,8)
+L(cj2):	add	R32(%rcx), R32(%rcx)	C bring the parked carry/borrow back into CF
+	ADCSBB	(up,n,8), %rbx
+	ADCSBB	8(up,n,8), %r9
+	mov	%rbx, (rp,n,8)
+L(cj1):	mov	%r9, 8(rp,n,8)
+	mov	%rdx, %rax		C bits shifted out of the top limb
+	ADCSBB	$0, %rax		C apply the final carry/borrow; rax is the return value
+	pop	%rbx
+	pop	%rbp
+	pop	%r12
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
new file mode 100644
index 0000000..1172b0d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
@@ -0,0 +1,179 @@
+dnl  AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor,
+dnl  returning quotient only.
+
+dnl  Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb
+C	     norm/unorm
+C AMD K8,K9	10	+
+C AMD K10	10	+
+C AMD bull	13.7	-
+C AMD pile	13.7	+
+C AMD steam
+C AMD excavator
+C AMD bobcat	15	-
+C AMD jaguar	16	-
+C Intel P4	33	=
+C Intel core2	13.25	=
+C Intel NHM	14	=
+C Intel SBR	8.5	-
+C Intel IBR	8.5	-
+C Intel HWL	8	=
+C Intel BWL	8	=
+C Intel SKL	8	=
+C Intel atom	42	--
+C Intel SLM	20.4	--
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp',		`%rdi')		C	quotient limbs are stored here
+define(`up',		`%rsi')		C	source limbs
+define(`n',		`%rdx')		C	limb count
+define(`d',		`%rcx')		C	divisor
+define(`di',		`%r8')		C	just mpn_pi1_bdiv_q_1
+define(`ncnt',		`%r9')		C	just mpn_pi1_bdiv_q_1
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+	FUNC_ENTRY(4)
+	push	%rbx			C popped again on the L(pi1) path
+
+	mov	%rcx, %rax		C rax = d
+	xor	R32(%rcx), R32(%rcx)	C ncnt count
+	mov	%rdx, %r10		C r10 = n
+
+	bt	$0, R32(%rax)
+	jnc	L(evn)			C skip bsf unless divisor is even
+
+L(odd):	mov	%rax, %rbx
+	shr	R32(%rax)
+	and	$127, R32(%rax)		C d/2, 7 bits
+
+	LEA(	binvert_limb_table, %rdx)
+
+	movzbl	(%rdx,%rax), R32(%rax)	C inv 8 bits
+
+	mov	%rbx, %r11		C d without twos
+
+	lea	(%rax,%rax), R32(%rdx)	C 2*inv
+	imul	R32(%rax), R32(%rax)	C inv*inv
+	imul	R32(%rbx), R32(%rax)	C inv*inv*d
+	sub	R32(%rax), R32(%rdx)	C inv = 2*inv - inv*inv*d, 16 bits
+
+	lea	(%rdx,%rdx), R32(%rax)	C 2*inv
+	imul	R32(%rdx), R32(%rdx)	C inv*inv
+	imul	R32(%rbx), R32(%rdx)	C inv*inv*d
+	sub	R32(%rdx), R32(%rax)	C inv = 2*inv - inv*inv*d, 32 bits
+
+	lea	(%rax,%rax), %r8	C 2*inv
+	imul	%rax, %rax		C inv*inv
+	imul	%rbx, %rax		C inv*inv*d
+	sub	%rax, %r8		C inv = 2*inv - inv*inv*d, 64 bits
+
+	jmp	L(pi1)			C r8 = inverse, rcx = ncnt, r10 = n, r11 = d
+
+L(evn):	bsf	%rax, %rcx		C rcx = index of the lowest set bit of d
+	shr	R8(%rcx), %rax		C shift the low zero bits out of d
+	jmp	L(odd)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	push	%rbx			C rbx holds the carry bit in the loop
+
+	mov	%rcx, %r11		C d
+	mov	%rdx, %r10		C n
+	mov	%r9, %rcx		C ncnt
+
+L(pi1):	mov	(up), %rax		C up[0]
+
+	dec	%r10			C n - 1 limbs are stored by the loop
+	jz	L(one)			C single limb: shift and multiply only
+
+	mov	8(up), %rdx		C up[1]
+	lea	(up,%r10,8), up		C up end
+	lea	(rp,%r10,8), rp		C rp end
+	neg	%r10			C -n
+
+	shrd	R8(%rcx), %rdx, %rax	C lowest limb shifted right by ncnt, refilled from up[1]
+
+	xor	R32(%rbx), R32(%rbx)	C no carry bit yet
+	jmp	L(ent)
+
+	ALIGN(8)
+L(top):
+	C rax	q
+	C rbx	carry bit, 0 or 1
+	C rcx	ncnt
+	C rdx
+	C r10	counter, limbs, negative
+	C r11	d
+
+	mul	%r11			C carry limb in rdx
+	mov	(up,%r10,8), %rax
+	mov	8(up,%r10,8), %r9
+	shrd	R8(%rcx), %r9, %rax
+	nop				C NOTE(review): presumably scheduling padding, confirm
+	sub	%rbx, %rax		C apply carry bit
+	setc	R8(%rbx)		C rbx = borrow out of that subtraction
+	sub	%rdx, %rax		C apply carry limb
+	adc	$0, R32(%rbx)		C add the borrow out of the carry limb subtraction
+L(ent):	imul	%r8, %rax		C quotient limb = low 64 bits of value * di
+	mov	%rax, (rp,%r10,8)
+	inc	%r10
+	jnz	L(top)
+
+	mul	%r11			C carry limb in rdx
+	mov	(up), %rax		C up high limb
+	shr	R8(%rcx), %rax
+	sub	%rbx, %rax		C apply carry bit
+	sub	%rdx, %rax		C apply carry limb
+	imul	%r8, %rax
+	mov	%rax, (rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(one):	shr	R8(%rcx), %rax		C one limb only: shift by ncnt, then multiply by di
+	imul	%r8, %rax
+	mov	%rax, (rp)
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
new file mode 100644
index 0000000..86de08c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
@@ -0,0 +1,249 @@
+dnl  x86-64 mpn_div_qr_1n_pi1
+dnl  -- Divide an mpn number by a normalized single-limb number,
+dnl     using a single-limb inverse.
+
+dnl  Contributed to the GNU project by Niels Möller
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C AMD K8,K9	11
+C AMD K10	11
+C AMD bull	16
+C AMD pile	14.25
+C AMD steam	 ?
+C AMD bobcat	16
+C AMD jaguar	 ?
+C Intel P4	47.5	poor
+C Intel core	28.5	very poor
+C Intel NHM	29	very poor
+C Intel SBR	16	poor
+C Intel IBR	13.5
+C Intel HWL	12
+C Intel BWL	 ?
+C Intel atom	53	very poor
+C VIA nano	19
+
+
+C INPUT Parameters
+define(`QP', `%rdi')	C quotient limbs are written here
+define(`UP', `%rsi')	C dividend limbs
+define(`UN_INPUT', `%rdx')	C limb count as passed in; copied to UN because mul writes rdx
+define(`U1', `%rcx')	C Also in %rax
+define(`D', `%r8')	C divisor
+define(`DINV', `%r9')	C single-limb inverse of the divisor
+
+C Invariants
+define(`B2', `%rbp')	C set to -D*DINV before the loop
+define(`B2md', `%rbx')	C B2 - D
+
+C Variables
+define(`UN', `%r8')	C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')	C 0 or all ones, produced by sbb
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+	ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	dec	UN_INPUT
+	jnz	L(first)		C more than one limb in UP: general path
+
+	C Just a single 2/1 division.
+	C T, U0 are allocated in scratch registers
+	lea	1(U1), T
+	mov	U1, %rax
+	mul	DINV
+	mov	(UP), U0
+	add	U0, %rax
+	adc	T, %rdx
+	mov	%rdx, T
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(single_div_done)
+	sub	D, %rax
+	add	$1, T
+L(single_div_done):
+	mov	T, (QP)			C quotient limb; the remainder is left in rax
+	FUNC_EXIT()
+	ret
+L(first):
+	C FIXME: Could delay some of these until we enter the loop.
+	push	%r15			C Q2
+	push	%r14			C Q1
+	push	%r13			C Q0
+	push	%r12			C U2
+	push	%rbx			C B2md
+	push	%rbp			C B2
+
+	mov	D, B2
+	imul	DINV, B2
+	neg	B2			C B2 = -D*DINV mod 2^64
+	mov	B2, B2md
+	sub	D, B2md			C B2md = B2 - D
+
+	C D not needed until final reduction
+	push	D
+	mov	UN_INPUT, UN	C Clobbers D
+
+	mov	DINV, %rax
+	mul	U1
+	mov	%rax, Q0
+	add	U1, %rdx
+	mov	%rdx, T
+
+	mov	B2, %rax
+	mul	U1
+	mov	-8(UP, UN, 8), U0
+	mov	(UP, UN, 8), U1
+	mov	T, (QP, UN, 8)
+	add	%rax, U0
+	adc	%rdx, U1
+	sbb	U2, U2			C U2 = 0 or all ones, taken from the carry
+	dec	UN			C dec leaves CF untouched
+	mov	U1, %rax
+	jz	L(final)
+	mov	$0, R32(Q1)		C mov, not xor: CF is consumed at the loop top
+
+	ALIGN(16)
+
+	C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
+	C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
+	C is zero, and carry holds an extra copy of U2.
+L(loop):
+	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+	C Remains to add in B (U1 + c)
+	cmovc	DINV, Q1
+	mov	U2, Q2
+	neg	Q2
+	mul	DINV
+	add	%rdx, Q1
+	adc	$0, Q2
+	add	Q0, Q1
+	mov	%rax, Q0
+	mov	B2, %rax
+	lea	(B2md, U0), T
+	adc	$0, Q2
+
+	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+	mul	U1
+	and	B2, U2
+	add	U2, U0
+	cmovnc	U0, T
+
+	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+	adc	U1, Q1
+	mov	-8(UP, UN, 8), U0
+	adc	Q2, 8(QP, UN, 8)
+	jc	L(q_incr)		C carry out: ripple it into higher quotient limbs
+L(q_incr_done):
+	add	%rax, U0
+	mov	T, %rax
+	adc	%rdx, %rax
+	mov	Q1, (QP, UN, 8)
+	mov	$0, R32(Q1)		C Q1 = 0 for the next pass, without touching CF
+	sbb	U2, U2			C U2 = 0 or all ones, taken from the carry
+	dec	UN			C dec leaves CF untouched for the loop top
+	mov	%rax, U1
+	jnz	L(loop)
+
+L(final):
+	pop	D			C D was pushed before UN took over r8
+
+	mov	U2, Q1
+	and	D, U2
+	sub	U2, %rax
+	neg	Q1
+
+	mov	%rax, U1
+	sub	D, %rax
+	cmovc	U1, %rax
+	sbb	$-1, Q1
+
+	lea	1(%rax), T
+	mul	DINV
+	add	U0, %rax
+	adc	T, %rdx
+	mov	%rdx, T
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(div_done)
+	sub	D, %rax
+	add	$1, T
+L(div_done):
+	add	T, Q0			C rax is not written again before the ret
+	mov	Q0, (QP)
+	adc	Q1, 8(QP)
+	jnc	L(done)
+L(final_q_incr):
+	addq	$1, 16(QP)		C ripple the carry into the next quotient limb
+	lea	8(QP), QP		C lea does not alter CF
+	jc	L(final_q_incr)
+
+L(done):
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT()
+	ret
+
+L(q_incr):
+	C U1 is not live, so use it for indexing
+	lea	16(QP, UN, 8), U1
+L(q_incr_loop):
+	addq	$1, (U1)
+	jnc	L(q_incr_done)		C stop once an increment does not carry
+	lea	8(U1), U1
+	jmp	L(q_incr_loop)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
new file mode 100644
index 0000000..d87cc3b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
@@ -0,0 +1,237 @@
+/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 64  /* limb width in bits */
+#define GMP_LIMB_BYTES 8  /* the same width in bytes (64 / 8) */
+
+#if 0  /* disabled: this mpn_sublsh_n override is compiled out */
+#undef mpn_sublsh_n
+#define mpn_sublsh_n(rp,up,vp,n,c)					\
+  (((rp) == (up)) ? mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c))	\
+   : MPN(mpn_sublsh_n)(rp,up,vp,n,c))
+#endif  /* 0 */
+
+/* 2500 MHz K8 Brisbane */
+/* FFT tuning limit = 115,768,433 */
+/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        35
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      9
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1_NORM_THRESHOLD              1
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           16
+
+#define DIV_1_VS_MUL_1_PERCENT             309
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD                81
+#define MUL_TOOM44_THRESHOLD               232
+#define MUL_TOOM6H_THRESHOLD               324
+#define MUL_TOOM8H_THRESHOLD               478
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      97
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     153
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     154
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     160
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     226
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 34
+#define SQR_TOOM3_THRESHOLD                114
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                430
+#define SQR_TOOM8_THRESHOLD                  0  /* always */
+
+#define MULMID_TOOM42_THRESHOLD             36
+
+#define MULMOD_BNM1_THRESHOLD               17
+#define SQRMOD_BNM1_THRESHOLD               19
+
+#define MUL_FFT_MODF_THRESHOLD             654  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    654, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     27, 7}, {     14, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     29, 8}, {     15, 7}, {     32, 8}, \
+    {     17, 7}, {     37, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 7}, {     44, 8}, {     23, 7}, {     47, 8}, \
+    {     25, 7}, {     51, 8}, {     31, 7}, {     63, 8}, \
+    {     37, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
+    {     53, 9}, {     27, 8}, {     57, 9}, {     31, 8}, \
+    {     67, 9}, {     35, 8}, {     71, 9}, {     39, 8}, \
+    {     81, 9}, {     43,10}, {     23, 9}, {     55, 8}, \
+    {    111,10}, {     31, 9}, {     71,10}, {     39, 9}, \
+    {     87,10}, {     47, 9}, {     99,10}, {     55, 9}, \
+    {    111,11}, {     31,10}, {     63, 9}, {    131,10}, \
+    {     71, 9}, {    147,10}, {     87,11}, {     47,10}, \
+    {    111,11}, {     63,10}, {    143,11}, {     79,10}, \
+    {    167,11}, {     95,10}, {    199,11}, {    111,12}, \
+    {     63,11}, {    143,10}, {    287,11}, {    159,12}, \
+    {     95,11}, {    191,10}, {    383,11}, {    207,10}, \
+    {    415,13}, {     63,12}, {    127,11}, {    255,10}, \
+    {    511,11}, {    271,10}, {    543,11}, {    287,12}, \
+    {    159,11}, {    319,10}, {    639,11}, {    335,10}, \
+    {    671,11}, {    351,12}, {    191,11}, {    415,12}, \
+    {    223,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    671,12}, {    351,11}, \
+    {    703,13}, {    191,12}, {    383,11}, {    767,12}, \
+    {    415,11}, {    831,12}, {    447,11}, {    895,12}, \
+    {    479,14}, {    127,13}, {    255,12}, {    543,11}, \
+    {   1087,12}, {    575,11}, {   1151,12}, {    607,13}, \
+    {    319,12}, {    735,13}, {    383,12}, {    831,13}, \
+    {    447,12}, {    959,14}, {    255,13}, {    511,12}, \
+    {   1087,13}, {    575,12}, {   1215,13}, {    639,12}, \
+    {   1279,13}, {    703,12}, {   1407,14}, {    383,13}, \
+    {    767,12}, {   1535,13}, {    831,12}, {   1663,13}, \
+    {    959,15}, {    255,14}, {    511,13}, {   1215,14}, \
+    {    639,13}, {   1471,14}, {    767,13}, {   1663,14}, \
+    {    895,13}, {   1855,15}, {    511,14}, {   1023,13}, \
+    {   2047,14}, {   1151,13}, {   2367,14}, {   1407,15}, \
+    {    767,14}, {   1791,16}, {    511,15}, {   1023,14}, \
+    {   2303,15}, {   1279,14}, {   2687,15}, {   1535,14}, \
+    {   3199,15}, {   1791,16}, {   1023,15}, {   2047,14}, \
+    {   4223,15}, {   2303,14}, {   4735,15}, {   2559,16}, \
+    {   1535,15}, {   3071,14}, {   6271,15}, {   3327,17}, \
+    { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+    {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 183  /* entry count of MUL_FFT_TABLE3 */
+#define MUL_FFT_THRESHOLD                11520
+
+#define SQR_FFT_MODF_THRESHOLD             540  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    540, 5}, {     21, 6}, {     11, 5}, {     23, 6}, \
+    {     12, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     16, 5}, {     33, 6}, {     29, 7}, {     15, 6}, \
+    {     31, 7}, {     16, 6}, {     33, 7}, {     33, 8}, \
+    {     17, 7}, {     37, 8}, {     19, 7}, {     39, 8}, \
+    {     21, 7}, {     43, 8}, {     23, 7}, {     47, 8}, \
+    {     25, 7}, {     51, 8}, {     29, 9}, {     15, 8}, \
+    {     37, 9}, {     19, 8}, {     43, 9}, {     23, 8}, \
+    {     51, 9}, {     27, 8}, {     55, 9}, {     31, 8}, \
+    {     65, 9}, {     35, 8}, {     71, 9}, {     43,10}, \
+    {     23, 9}, {     55,10}, {     31, 9}, {     71,10}, \
+    {     39, 9}, {     83,10}, {     47, 9}, {     99,10}, \
+    {     55, 9}, {    111,11}, {     31,10}, {     63, 9}, \
+    {    127,10}, {     87,11}, {     47,10}, {    111,12}, \
+    {     31,11}, {     63,10}, {    143,11}, {     79,10}, \
+    {    167,11}, {     95,10}, {    191,11}, {    111,12}, \
+    {     63,11}, {    127, 9}, {    511,11}, {    143,10}, \
+    {    287, 9}, {    575,11}, {    159,12}, {     95,11}, \
+    {    191,10}, {    383, 9}, {    767,11}, {    207,10}, \
+    {    415,13}, {     63,12}, {    127,10}, {    511, 9}, \
+    {   1023,11}, {    271,10}, {    543, 9}, {   1087,11}, \
+    {    287,10}, {    575,12}, {    159,11}, {    319,10}, \
+    {    639,11}, {    335,10}, {    671,11}, {    351,10}, \
+    {    703,12}, {    191,11}, {    383,10}, {    767,11}, \
+    {    415,10}, {    831,12}, {    223,11}, {    447,13}, \
+    {    127,11}, {    511,10}, {   1023,11}, {    543,10}, \
+    {   1087,12}, {    287,11}, {    575,10}, {   1151,11}, \
+    {    607,12}, {    319,11}, {    639,10}, {   1279,11}, \
+    {    671,12}, {    351,11}, {    703,13}, {    191,12}, \
+    {    383,11}, {    767,12}, {    415,11}, {    831,12}, \
+    {    447,11}, {    895,14}, {    127,12}, {    511,11}, \
+    {   1023,12}, {    543,11}, {   1087,12}, {    575,11}, \
+    {   1151,12}, {    607,11}, {   1215,13}, {    319,12}, \
+    {    639,11}, {   1279,12}, {    671,11}, {   1343,12}, \
+    {    703,11}, {   1407,12}, {    735,13}, {    383,12}, \
+    {    767,11}, {   1535,12}, {    831,13}, {    447,12}, \
+    {    959,13}, {    511,12}, {   1087,13}, {    575,12}, \
+    {   1215,13}, {    639,12}, {   1343,13}, {    703,12}, \
+    {   1407,14}, {    383,13}, {    767,12}, {   1535,13}, \
+    {    831,12}, {   1663,13}, {    895,12}, {   1791,13}, \
+    {    959,14}, {    511,13}, {   1215,14}, {    639,13}, \
+    {   1471,14}, {    767,13}, {   1663,14}, {    895,13}, \
+    {   1791,15}, {    511,14}, {   1023,13}, {   2111,14}, \
+    {   1151,13}, {   2303,14}, {   1407,15}, {    767,14}, \
+    {   1791,16}, {    511,15}, {   1023,14}, {   2303,15}, \
+    {   1279,14}, {   2687,15}, {   1535,14}, {   3199,15}, \
+    {   1791,16}, {   1023,15}, {   2047,14}, {   4223,15}, \
+    {   2303,14}, {   4863,15}, {   2559,16}, {   1535,15}, \
+    {   3071,14}, {   6271,15}, {   3327,17}, { 131072,18}, \
+    { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \
+    {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 202
+#define SQR_FFT_THRESHOLD                 7296
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  61
+#define MULLO_MUL_N_THRESHOLD            22239
+#define SQRLO_BASECASE_THRESHOLD             8
+#define SQRLO_DC_THRESHOLD                   0  /* never mpn_sqrlo_basecase */
+#define SQRLO_SQR_THRESHOLD              14281
+
+#define DC_DIV_QR_THRESHOLD                 47
+#define DC_DIVAPPR_Q_THRESHOLD             266
+#define DC_BDIV_QR_THRESHOLD                38
+#define DC_BDIV_Q_THRESHOLD                104
+
+#define INV_MULMOD_BNM1_THRESHOLD           54
+#define INV_NEWTON_THRESHOLD               252
+#define INV_APPR_THRESHOLD                 250
+
+#define BINV_NEWTON_THRESHOLD              258
+#define REDC_1_TO_REDC_2_THRESHOLD          35
+#define REDC_2_TO_REDC_N_THRESHOLD          79
+
+#define MU_DIV_QR_THRESHOLD               2089
+#define MU_DIVAPPR_Q_THRESHOLD            1895
+#define MUPI_DIV_QR_THRESHOLD               99
+#define MU_BDIV_QR_THRESHOLD              1787
+#define MU_BDIV_Q_THRESHOLD               1895
+
+#define POWM_SEC_TABLE  1,16,194,960,2825
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        26
+#define SET_STR_DC_THRESHOLD               248
+#define SET_STR_PRECOMPUTE_THRESHOLD      1747
+
+#define FAC_DSC_THRESHOLD                 1240
+#define FAC_ODD_THRESHOLD                   27
+
+#define MATRIX22_STRASSEN_THRESHOLD         21
+#define HGCD2_DIV1_METHOD                    3  /* 4.10% faster than 5 */
+#define HGCD_THRESHOLD                     141
+#define HGCD_APPR_THRESHOLD                181
+#define HGCD_REDUCE_THRESHOLD             4633
+#define GCD_DC_THRESHOLD                   622
+#define GCDEXT_DC_THRESHOLD                496
+#define JACOBI_BASE_METHOD                   1  /* 0.97% faster than 3 */
+
+/* Tuneup completed successfully, took 131832 seconds */
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm
new file mode 100644
index 0000000..ca2efb9
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm
@@ -0,0 +1,469 @@
+dnl  AMD64 mpn_mul_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund and David Harvey.
+
+dnl  Copyright 2008, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 2.375
+C AMD K10	 2.375
+C Intel P4	15-16
+C Intel core2	 4.45
+C Intel corei	 4.35
+C Intel atom	 ?
+C VIA nano	 4.5
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Use fewer registers.  (how??? I can't see it -- david)
+C  * Avoid some "mov $0,r" and instead use "xor r,r".
+C  * Can the top of each L(addmul_outer_n) prologue be folded into the
+C    mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
+C    case where vn = 1 or 2; is it worth it?
+
+C INPUT PARAMETERS
+define(`rp',      `%rdi')	C destination limbs, written through rp-relative stores
+define(`up',      `%rsi')	C first source operand, read limb by limb
+define(`un_param',`%rdx')	C limb count for up; moved to un because mul clobbers %rdx
+define(`vp',      `%rcx')	C second source operand
+define(`vn',      `%r8')	C limb count for vp; counted down by 1 or 2 per pass
+
+define(`v0', `%r12')		C multiplier limb of the current pass
+define(`v1', `%r9')		C second multiplier limb for the two-limb passes
+
+define(`w0', `%rbx')		C w0-w3: rotating accumulator limbs
+define(`w1', `%r15')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n',  `%r11')		C negative limb index that counts up towards zero
+define(`outer_addr', `%r14')	C addmul entry address chosen by the first pass
+define(`un',  `%r13')		C negated limb count, copied into n at the start of each pass
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+C mpn_mul_basecase: one mul_1 or mul_2 pass, then addmul_2 passes for the remaining vp limbs
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%rbx			C save the callee-saved registers aliased as w0 w2 v0 un outer_addr w1
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	xor	R32(un), R32(un)	C un = 0
+	mov	(up), %rax		C rax = up[0]
+	mov	(vp), v0		C v0 = vp[0]
+
+	sub	un_param, un		C un = -un_param; rdx used by mul
+	mov	un, n			C n starts at -un_param
+	mov	R32(un_param), R32(w0)	C w0 = un_param, reduced mod 4 by the prologue dispatch
+
+	lea	(rp,un_param,8), rp	C rp and up now point un_param limbs further on,
+	lea	(up,un_param,8), up	C so the negative index n addresses the operands
+
+	mul	v0			C rdx:rax = up[0] * vp[0]
+
+	test	$1, R8(vn)		C parity of vn selects the first pass
+	jz	L(mul_2)		C even vn: start with mul_2, else fall into mul_1
+
+C ===========================================================
+C     mul_1 for vp[0] if vn is odd: stores up[] * vp[0] at rp[]
+
+L(mul_1):
+	and	$3, R32(w0)		C un_param mod 4 selects the loop entry point
+	jz	L(mul_1_prologue_0)
+	cmp	$2, R32(w0)
+	jc	L(mul_1_prologue_1)	C below 2: remainder 1
+	jz	L(mul_1_prologue_2)	C equal: remainder 2; fall through for 3
+
+L(mul_1_prologue_3):
+	add	$-1, n
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	mov	%rax, w3
+	mov	%rdx, w0
+	jmp	L(mul_1_entry_3)
+
+L(mul_1_prologue_0):
+	mov	%rax, w2
+	mov	%rdx, w3		C note: already w0 == 0
+	lea	L(addmul_outer_0)(%rip), outer_addr
+	jmp	L(mul_1_entry_0)
+
+L(mul_1_prologue_1):
+	cmp	$-1, un			C un == -1 means un_param == 1
+	jne	2f
+	mov	%rax, -8(rp)		C single limb: store the two-limb product
+	mov	%rdx, (rp)
+	jmp	L(ret)			C and return without running any loop
+2:	add	$1, n
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	mov	%rax, w1
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	mov	(up,n,8), %rax
+	jmp	L(mul_1_entry_1)
+
+L(mul_1_prologue_2):
+	add	$-2, n
+	lea	L(addmul_outer_2)(%rip), outer_addr
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	24(up,n,8), %rax
+	xor	R32(w2), R32(w2)
+	xor	R32(w3), R32(w3)
+	jmp	L(mul_1_entry_2)
+
+
+	C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments
+
+	ALIGN(16)
+L(mul_1_top):
+	mov	w0, -16(rp,n,8)
+	add	%rax, w1
+	mov	(up,n,8), %rax
+	adc	%rdx, w2
+L(mul_1_entry_1):
+	xor	R32(w0), R32(w0)
+	mul	v0
+	mov	w1, -8(rp,n,8)
+	add	%rax, w2
+	adc	%rdx, w3
+L(mul_1_entry_0):
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	w2, (rp,n,8)
+	add	%rax, w3
+	adc	%rdx, w0
+L(mul_1_entry_3):
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	w3, 8(rp,n,8)
+	xor	R32(w2), R32(w2)	C zero
+	mov	w2, w3			C zero
+	add	%rax, w0
+	mov	24(up,n,8), %rax
+	mov	w2, w1			C zero
+	adc	%rdx, w1
+L(mul_1_entry_2):
+	mul	v0
+	add	$4, n			C four limbs per iteration
+	js	L(mul_1_top)		C loop while n is still negative
+
+	mov	w0, -16(rp)		C wind down: store the last three limbs
+	add	%rax, w1
+	mov	w1, -8(rp)
+	adc	%rdx, w2
+	mov	w2, (rp)
+
+	add	$-1, vn			C vn -= 1
+	jz	L(ret)			C no vp limbs left
+
+	mov	8(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	16(vp), v1
+
+	lea	8(vp), vp		C vp += 1
+	lea	8(rp), rp		C rp += 1
+
+	jmp	*outer_addr		C enter the addmul_2 prologue chosen above
+
+C ===========================================================
+C     mul_2 for vp[0], vp[1] if vn is even
+
+	ALIGN(16)
+L(mul_2):
+	mov	8(vp), v1		C v1 = vp[1]; v0 = vp[0] was loaded at entry
+
+	and	$3, R32(w0)		C un_param mod 4 selects the loop entry point
+	jz	L(mul_2_prologue_0)
+	cmp	$2, R32(w0)
+	jz	L(mul_2_prologue_2)
+	jc	L(mul_2_prologue_1)	C below 2: remainder 1; fall through for 3
+
+L(mul_2_prologue_3):
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	add	$2, n
+	mov	%rax, -16(rp,n,8)	C low limb of up[0] * v0 is already final
+	mov	%rdx, w2
+	xor	R32(w3), R32(w3)
+	xor	R32(w0), R32(w0)
+	mov	-16(up,n,8), %rax
+	jmp	L(mul_2_entry_3)
+
+	ALIGN(16)
+L(mul_2_prologue_0):
+	add	$3, n
+	mov	%rax, w0
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mov	-24(up,n,8), %rax
+	lea	L(addmul_outer_0)(%rip), outer_addr
+	jmp	L(mul_2_entry_0)
+
+	ALIGN(16)
+L(mul_2_prologue_1):
+	mov	%rax, w3
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	jmp	L(mul_2_entry_1)
+
+	ALIGN(16)
+L(mul_2_prologue_2):
+	add	$1, n
+	lea	L(addmul_outer_2)(%rip), outer_addr
+	mov	$0, R32(w0)
+	mov	$0, R32(w1)
+	mov	%rax, w2
+	mov	-8(up,n,8), %rax
+	mov	%rdx, w3
+	jmp	L(mul_2_entry_2)
+
+	C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments
+
+	ALIGN(16)
+L(mul_2_top):
+	mov	-32(up,n,8), %rax
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,n,8), %rax
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,n,8), %rax	C same up limb reloaded for the v1 product
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(mul_2_entry_0):
+	mul	v1
+	add	%rax, w1
+	mov	w0, -24(rp,n,8)
+	adc	%rdx, w2
+	mov	-16(up,n,8), %rax
+	mul	v0
+	mov	$0, R32(w3)		C mov leaves the carry flag alone, unlike xor
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,n,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(rp,n,8)
+L(mul_2_entry_3):
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w3
+	adc	R32(w1), R32(w0)	C adc $0, w0
+L(mul_2_entry_2):
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(rp,n,8)
+	adc	%rdx, w0
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(mul_2_entry_1):
+	add	$4, n			C four limbs per iteration
+	mov	w3, -32(rp,n,8)
+	js	L(mul_2_top)		C mov keeps the sign flag set by the add
+
+	mov	-32(up,n,8), %rax	C FIXME: n is constant
+	mul	v1			C wind down: last up limb times v1
+	add	%rax, w0
+	mov	w0, (rp)
+	adc	%rdx, w1
+	mov	w1, 8(rp)
+
+	add	$-2, vn			C vn -= 2
+	jz	L(ret)			C no vp limbs left
+
+	mov	16(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	24(vp), v1
+
+	lea	16(vp), vp		C vp += 2
+	lea	16(rp), rp		C rp += 2
+
+	jmp	*outer_addr		C enter the addmul_2 prologue chosen above
+
+
+C ===========================================================
+C     addmul_2 for remaining vp's: adds up[] * (v0, v1) into rp[], two vp limbs per pass
+
+	C in the following prologues, we reuse un to store the
+	C adjusted value of n that is reloaded on each iteration
+
+L(addmul_outer_0):
+	add	$3, un			C one-time adjustment, done on the first visit only
+	lea	0(%rip), outer_addr	C outer_addr = address of the next instruction, so later passes skip the add
+
+	mov	un, n
+	mov	-24(up,un,8), %rax
+	mul	v0
+	mov	%rax, w0
+	mov	-24(up,un,8), %rax
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	jmp	L(addmul_entry_0)
+
+L(addmul_outer_1):
+	mov	un, n			C no un adjustment here, so outer_addr is left unchanged
+	mov	(up,un,8), %rax
+	mul	v0
+	mov	%rax, w3
+	mov	(up,un,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	jmp	L(addmul_entry_1)
+
+L(addmul_outer_2):
+	add	$1, un			C one-time adjustment, done on the first visit only
+	lea	0(%rip), outer_addr	C later passes re-enter at the next instruction
+
+	mov	un, n
+	mov	-8(up,un,8), %rax
+	mul	v0
+	xor	R32(w0), R32(w0)
+	mov	%rax, w2
+	xor	R32(w1), R32(w1)
+	mov	%rdx, w3
+	mov	-8(up,un,8), %rax
+	jmp	L(addmul_entry_2)
+
+L(addmul_outer_3):
+	add	$2, un			C one-time adjustment, done on the first visit only
+	lea	0(%rip), outer_addr	C later passes re-enter at the next instruction
+
+	mov	un, n
+	mov	-16(up,un,8), %rax
+	xor	R32(w3), R32(w3)
+	mul	v0
+	mov	%rax, w1
+	mov	-16(up,un,8), %rax
+	mov	%rdx, w2
+	jmp	L(addmul_entry_3)
+
+	C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments
+
+	ALIGN(16)
+L(addmul_top):
+	add	w3, -32(rp,n,8)
+	adc	%rax, w0
+	mov	-24(up,n,8), %rax
+	adc	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,n,8), %rax
+	adc	%rdx, w1
+	adc	R32(w2), R32(w2)	C adc $0, w2
+L(addmul_entry_0):
+	mul	v1
+	xor	R32(w3), R32(w3)
+	add	w0, -24(rp,n,8)
+	adc	%rax, w1
+	mov	-16(up,n,8), %rax
+	adc	%rdx, w2
+	mul	v0
+	add	%rax, w1
+	mov	-16(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+L(addmul_entry_3):
+	mul	v1
+	add	w1, -16(rp,n,8)
+	adc	%rax, w2
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	xor	R32(w0), R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)		C mov leaves the carry flag alone for the adc below
+	mov	-8(up,n,8), %rax
+	adc	R32(w1), R32(w0)	C adc $0, w0
+L(addmul_entry_2):
+	mul	v1
+	add	w2, -8(rp,n,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	(up,n,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(addmul_entry_1):
+	mul	v1
+	add	$4, n			C four limbs per iteration
+	js	L(addmul_top)		C loop while n is still negative
+
+	add	w3, -8(rp)		C wind down: fold in the last accumulator limbs
+	adc	%rax, w0
+	mov	w0, (rp)
+	adc	%rdx, w1
+	mov	w1, 8(rp)
+
+	add	$-2, vn			C vn -= 2
+	jz	L(ret)			C no vp limbs left
+
+	lea	16(rp), rp		C rp += 2
+	lea	16(vp), vp		C vp += 2
+
+	mov	(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	8(vp), v1
+
+	jmp	*outer_addr		C next pass through the same prologue
+
+	ALIGN(16)
+L(ret):	pop	%r15		C common exit: restore registers in reverse push order
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm
new file mode 100644
index 0000000..fa00f42
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm
@@ -0,0 +1,436 @@
+dnl  AMD64 mpn_mullo_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C     large trip count.  Instead, we should start with mul_2 for any operand
+C     size congruence class.
+C   * Stop iterating addmul_2 earlier, falling into straight-line triangle code
+C     for the last 2-3 iterations.
+C   * Perhaps implement n=4 special code.
+C   * The reload of the outer loop jump address hurts branch prediction.
+C   * The addmul_2 loop ends with an MUL whose high part is not used upon loop
+C     exit.
+
+C INPUT PARAMETERS
+define(`rp',	   `%rdi')	C destination limbs
+define(`up',	   `%rsi')	C first source operand
+define(`vp_param', `%rdx')	C second source operand on entry; copied to vp because mul clobbers %rdx
+define(`n',	   `%rcx')	C limb count; negated in the general path
+
+define(`vp',	`%r11')		C working copy of vp_param
+define(`outer_addr', `%r8')	C address of the next addmul prologue
+define(`j',	`%r9')		C negative inner-loop index that counts up towards zero
+define(`v0',	`%r13')		C multiplier limb of the current pass
+define(`v1',	`%r14')		C second multiplier limb for the two-limb passes
+define(`w0',	`%rbx')		C w0-w3: rotating accumulator limbs
+define(`w1',	`%r15')
+define(`w2',	`%rbp')
+define(`w3',	`%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+C mpn_mullo_basecase: n below 4 uses straight-line code via a jump table, larger n the general loops
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mullo_basecase)
+	FUNC_ENTRY(4)
+	cmp	$4, n
+	jge	L(gen)			C n >= 4: general code
+	mov	(up), %rax		C u0
+	mov	(vp_param), %r8		C v0
+
+	lea	L(tab)(%rip), %r9	C small n: dispatch through the table below, indexed by n
+ifdef(`PIC',
+`	movslq	(%r9,%rcx,4), %r10
+	add	%r10, %r9
+	jmp	*%r9
+',`
+	jmp	*(%r9,n,8)
+')
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(tab), L(tab))			C not allowed
+	JMPENT(	L(1), L(tab))			C 1
+	JMPENT(	L(2), L(tab))			C 2
+	JMPENT(	L(3), L(tab))			C 3
+dnl	JMPENT(	L(0m4), L(tab))			C 4
+dnl	JMPENT(	L(1m4), L(tab))			C 5
+dnl	JMPENT(	L(2m4), L(tab))			C 6
+dnl	JMPENT(	L(3m4), L(tab))			C 7
+dnl	JMPENT(	L(0m4), L(tab))			C 8
+dnl	JMPENT(	L(1m4), L(tab))			C 9
+dnl	JMPENT(	L(2m4), L(tab))			C 10
+dnl	JMPENT(	L(3m4), L(tab))			C 11
+	TEXT
+
+L(1):	imul	%r8, %rax		C n = 1: low limb of u0 x v0
+	mov	%rax, (rp)
+	FUNC_EXIT()
+	ret
+
+L(2):	mov	8(vp_param), %r11	C n = 2: r11 = v1
+	imul	%rax, %r11		C u0 x v1
+	mul	%r8			C u0 x v0
+	mov	%rax, (rp)
+	imul	8(up), %r8		C u1 x v0
+	lea	(%r11, %rdx), %rax	C limb 1 = high half of u0 x v0 plus the two low cross products
+	add	%r8, %rax
+	mov	%rax, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(3):	mov	8(vp_param), %r9	C v1
+	mov	16(vp_param), %r11	C v2
+	mul	%r8			C u0 x v0 -> <r1,r0>
+	mov	%rax, (rp)		C r0
+	mov	(up), %rax		C u0
+	mov	%rdx, %rcx		C r1
+	mul	%r9			C u0 x v1 -> <r2,r1>
+	imul	8(up), %r9		C u1 x v1 -> r2
+	mov	16(up), %r10
+	imul	%r8, %r10		C u2 x v0 -> r2
+	add	%rax, %rcx
+	adc	%rdx, %r9
+	add	%r10, %r9
+	mov	8(up), %rax		C u1
+	mul	%r8			C u1 x v0 -> <r2,r1>
+	add	%rax, %rcx
+	adc	%rdx, %r9
+	mov	%r11, %rax
+	imul	(up), %rax		C u0 x v2 -> r2
+	add	%rax, %r9
+	mov	%rcx, 8(rp)		C r1
+	mov	%r9, 16(rp)		C r2
+	FUNC_EXIT()
+	ret
+
+L(0m4):				C the Nm4 labels are named only by the disabled JMPENT table entries
+L(1m4):
+L(2m4):
+L(3m4):
+L(gen):	push	%rbx		C general case: save the callee-saved registers used below
+	push	%rbp
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	(up), %rax		C rax = up[0]
+	mov	(vp_param), v0		C v0 = vp[0]
+	mov	vp_param, vp		C keep vp in r11 because mul clobbers rdx
+
+	lea	(rp,n,8), rp		C rp and up now point n limbs further on
+	lea	(up,n,8), up
+	neg	n			C n is negative from here on
+
+	mul	v0			C rdx:rax = up[0] * vp[0]
+
+	test	$1, R8(n)		C negation keeps the low bit, so this is the parity of n
+	jz	L(mul_2)		C even n: start with mul_2, else fall into mul_1
+
+L(mul_1):			C first pass for odd n: one vp limb
+	lea	-8(rp), rp
+	lea	-8(up), up
+	test	$2, R8(n)		C bit 1 of the negated n selects the loop entry point
+	jnz	L(mul_1_prologue_3)
+
+L(mul_1_prologue_2):		C n = 7, 11, 15, ...
+	lea	-1(n), j
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	mov	%rax, w0
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	xor	R32(w3), R32(w3)
+	mov	16(up,n,8), %rax
+	jmp	L(mul_1_entry_2)
+
+L(mul_1_prologue_3):		C n = 5, 9, 13, ...
+	lea	1(n), j
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	mov	%rax, w2
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	jmp	L(mul_1_entry_0)
+
+	ALIGN(16)
+L(mul_1_top):
+	mov	w0, -16(rp,j,8)
+	add	%rax, w1
+	mov	(up,j,8), %rax
+	adc	%rdx, w2
+	xor	R32(w0), R32(w0)
+	mul	v0
+	mov	w1, -8(rp,j,8)
+	add	%rax, w2
+	adc	%rdx, w3
+L(mul_1_entry_0):
+	mov	8(up,j,8), %rax
+	mul	v0
+	mov	w2, (rp,j,8)
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	16(up,j,8), %rax
+	mul	v0
+	mov	w3, 8(rp,j,8)
+	xor	R32(w2), R32(w2)	C zero
+	mov	w2, w3			C zero
+	add	%rax, w0
+	mov	24(up,j,8), %rax
+	mov	w2, w1			C zero
+	adc	%rdx, w1
+L(mul_1_entry_2):
+	mul	v0
+	add	$4, j			C four limbs per iteration
+	js	L(mul_1_top)		C loop while j is still negative
+
+	mov	w0, -16(rp)
+	add	%rax, w1
+	mov	w1, -8(rp)
+	adc	%rdx, w2
+
+	imul	(up), v0		C top limb: only the low 64 bits of this product are used
+	add	v0, w2
+	mov	w2, (rp)
+
+	add	$1, n			C one vp limb consumed
+	jz	L(ret)
+
+	mov	8(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	16(vp), v1
+
+	lea	16(up), up
+	lea	8(vp), vp
+	lea	24(rp), rp
+
+	jmp	*outer_addr		C enter the addmul prologue chosen above
+
+
+L(mul_2):			C first pass for even n: two vp limbs
+	mov	8(vp), v1		C v1 = vp[1]
+	test	$2, R8(n)		C bit 1 of the negated n selects the loop entry point
+	jz	L(mul_2_prologue_3)
+
+	ALIGN(16)
+L(mul_2_prologue_1):
+	lea	0(n), j			C j = n
+	mov	%rax, w3
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	mov	(up,n,8), %rax
+	lea	L(addmul_outer_3)(%rip), outer_addr
+	jmp	L(mul_2_entry_1)
+
+	ALIGN(16)
+L(mul_2_prologue_3):
+	lea	2(n), j			C j = n + 2
+	mov	$0, R32(w3)
+	mov	%rax, w1
+	mov	(up,n,8), %rax
+	mov	%rdx, w2
+	lea	L(addmul_outer_1)(%rip), outer_addr
+	jmp	L(mul_2_entry_3)
+
+	ALIGN(16)
+L(mul_2_top):
+	mov	-32(up,j,8), %rax
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax	C same up limb reloaded for the v1 product
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1
+	add	%rax, w1
+	mov	w0, -24(rp,j,8)
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	mul	v0
+	mov	$0, R32(w3)		C mov leaves the carry flag alone, unlike xor
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+L(mul_2_entry_3):
+	mov	$0, R32(w0)
+	mov	w1, -16(rp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	R32(w1), R32(w0)	C w1 is zero here, so this adds only the carry
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(rp,j,8)
+	adc	%rdx, w0
+	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(mul_2_entry_1):
+	add	$4, j			C four limbs per iteration
+	mov	w3, -32(rp,j,8)
+	js	L(mul_2_top)		C mov keeps the sign flag set by the add
+
+	imul	-16(up), v1		C top limb: only the low 64 bits of these two products are used
+	add	v1, w0
+	imul	-8(up), v0
+	add	v0, w0
+	mov	w0, -8(rp)
+
+	add	$2, n			C two vp limbs consumed
+	jz	L(ret)
+
+	mov	16(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	24(vp), v1
+
+	lea	16(vp), vp
+	lea	16(rp), rp
+
+	jmp	*outer_addr		C enter the addmul prologue chosen above
+
+
+L(addmul_outer_1):		C addmul_2 pass entered at addmul_entry_1
+	lea	-2(n), j		C j = n - 2
+	mov	-16(up,n,8), %rax
+	mul	v0
+	mov	%rax, w3
+	mov	-16(up,n,8), %rax
+	mov	%rdx, w0
+	xor	R32(w1), R32(w1)
+	lea	L(addmul_outer_3)(%rip), outer_addr	C the next pass uses the other prologue
+	jmp	L(addmul_entry_1)
+
+L(addmul_outer_3):		C addmul_2 pass entered at addmul_entry_3
+	lea	0(n), j			C j = n
+	mov	-16(up,n,8), %rax
+	xor	R32(w3), R32(w3)
+	mul	v0
+	mov	%rax, w1
+	mov	-16(up,n,8), %rax
+	mov	%rdx, w2
+	lea	L(addmul_outer_1)(%rip), outer_addr	C the next pass uses the other prologue
+	jmp	L(addmul_entry_3)
+
+	ALIGN(16)
+L(addmul_top):
+	add	w3, -32(rp,j,8)
+	adc	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	R32(w2), R32(w2)	C w2 is zero here, so this leaves just the carry in w2
+	mul	v1
+	xor	R32(w3), R32(w3)
+	add	w0, -24(rp,j,8)
+	adc	%rax, w1
+	mov	-16(up,j,8), %rax
+	adc	%rdx, w2
+	mul	v0
+	add	%rax, w1
+	mov	-16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+L(addmul_entry_3):
+	mul	v1
+	add	w1, -16(rp,j,8)
+	adc	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	xor	R32(w0), R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)		C mov leaves the carry flag alone for the adc below
+	mov	-8(up,j,8), %rax
+	adc	R32(w1), R32(w0)	C w1 is zero here, so this adds only the carry
+	mul	v1
+	add	w2, -8(rp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+L(addmul_entry_1):
+	mul	v1
+	add	$4, j			C four limbs per iteration
+	js	L(addmul_top)		C loop while j is still negative
+
+	add	w3, -32(rp)
+	adc	%rax, w0		C the high half in rdx from the last mul is not used
+
+	imul	-24(up), v0		C top limb: only the low 64 bits of this product are used
+	add	v0, w0
+	add	w0, -24(rp)
+
+	add	$2, n			C two vp limbs consumed
+	jns	L(ret)			C done once n is no longer negative
+
+	lea	16(vp), vp
+
+	mov	(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	8(vp), v1
+
+	lea	-16(up), up		C up steps back two limbs for the next pass
+
+	jmp	*outer_addr
+
+L(ret):	pop	%r15		C exit of the general path: restore registers in reverse push order
+	pop	%r14
+	pop	%r13
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm
new file mode 100644
index 0000000..86f1414
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm
@@ -0,0 +1,559 @@
+dnl  AMD64 mpn_mulmid_basecase
+
+dnl  Contributed by David Harvey.
+
+dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C	     cycles/limb
+C K8,K9:	 2.375  (2.5 when un - vn is "small")
+C K10:		 ?
+C P4:		 ?
+C P6-15:	 ?
+
+C INPUT PARAMETERS
+define(`rp',      `%rdi')	C destination limbs
+define(`up',      `%rsi')	C first source operand
+define(`un_param',`%rdx')	C limb count for up; %rdx is clobbered by mul
+define(`vp_param',`%rcx')	C second source operand on entry; copied to vp first thing
+define(`vn',      `%r8')	C limb count for vp
+
+define(`v0', `%r12')		C first multiplier limb of the current pass
+define(`v1', `%r9')		C second multiplier limb for the two-limb passes
+
+define(`w0', `%rbx')		C w0-w3: rotating accumulator limbs
+define(`w1', `%rcx')		C shares %rcx with vp_param
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`n',  `%r11')		C inner-loop counter
+define(`outer_addr', `%r14')	C re-entry address for the next pass
+define(`un',  `%r13')		C row length un_param - vn + 1, negated later
+define(`vp',  `%r15')		C working copy of vp_param
+
+define(`vp_inner', `%r10')	C shares %r10 with w3; used by the diagonal code, which does not touch w3
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+C mpn_mulmid_basecase: row-wise mul_1 or mul_2 plus addmul_2 passes, or diagonal sums for short rows
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mulmid_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+	push	%rbx			C save the callee-saved registers used below
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	vp_param, vp		C frees %rcx, which doubles as w1
+
+	C use un for row length (= un_param - vn + 1)
+	lea	1(un_param), un
+	sub	vn, un
+
+	lea	(rp,un,8), rp		C rp now points un limbs further on
+
+	cmp	$4, un		C TODO: needs tuning
+	jc	L(diagonal)		C row length below 4: use the diagonal code
+
+	lea	(up,un_param,8), up	C up now points un_param limbs further on
+
+	test	$1, vn			C parity of vn selects the first pass
+	jz	L(mul_2)		C even vn: start with mul_2, else fall into mul_1
+
+C ===========================================================
+C     mul_1 for vp[0] if vn is odd
+
+L(mul_1):
+	mov	R32(un), R32(w0)	C w0 = row length, reduced mod 4 below
+
+	neg	un			C un is negative from here on
+	mov	(up,un,8), %rax
+	mov	(vp), v0		C v0 = vp[0]
+	mul	v0
+
+	and	$-4, un		C round down to multiple of 4
+	mov	un, n
+
+	and	$3, R32(w0)		C row length mod 4 selects the loop entry point
+	jz	L(mul_1_prologue_0)
+	cmp	$2, R32(w0)
+	jc	L(mul_1_prologue_1)	C below 2: remainder 1
+	jz	L(mul_1_prologue_2)	C equal: remainder 2; fall through for 3
+
+L(mul_1_prologue_3):
+	mov	%rax, w3
+	mov	%rdx, w0
+	lea	L(addmul_prologue_3)(%rip), outer_addr
+	jmp	L(mul_1_entry_3)
+
+	ALIGN(16)
+L(mul_1_prologue_0):
+	mov	%rax, w2
+	mov	%rdx, w3		C note already w0 == 0
+	lea	L(addmul_prologue_0)(%rip), outer_addr
+	jmp	L(mul_1_entry_0)
+
+	ALIGN(16)
+L(mul_1_prologue_1):
+	add	$4, n
+	mov	%rax, w1
+	mov	%rdx, w2
+	mov	$0, R32(w3)
+	mov	(up,n,8), %rax
+	lea	L(addmul_prologue_1)(%rip), outer_addr
+	jmp	L(mul_1_entry_1)
+
+	ALIGN(16)
+L(mul_1_prologue_2):
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	24(up,n,8), %rax
+	mov	$0, R32(w2)
+	mov	$0, R32(w3)
+	lea	L(addmul_prologue_2)(%rip), outer_addr
+	jmp	L(mul_1_entry_2)
+
+
+	C this loop is 10 c/loop = 2.5 c/l on K8
+
+	ALIGN(16)
+L(mul_1_top):
+	mov	w0, -16(rp,n,8)
+	add	%rax, w1
+	mov	(up,n,8), %rax
+	adc	%rdx, w2
+L(mul_1_entry_1):
+	mov	$0, R32(w0)		C mov leaves the carry flag alone, unlike xor
+	mul	v0
+	mov	w1, -8(rp,n,8)
+	add	%rax, w2
+	adc	%rdx, w3
+L(mul_1_entry_0):
+	mov	8(up,n,8), %rax
+	mul	v0
+	mov	w2, (rp,n,8)
+	add	%rax, w3
+	adc	%rdx, w0
+L(mul_1_entry_3):
+	mov	16(up,n,8), %rax
+	mul	v0
+	mov	w3, 8(rp,n,8)
+	mov	$0, R32(w2)		C zero
+	mov	w2, w3			C zero
+	add	%rax, w0
+	mov	24(up,n,8), %rax
+	mov	w2, w1			C zero
+	adc	%rdx, w1
+L(mul_1_entry_2):
+	mul	v0
+	add	$4, n			C four limbs per iteration
+	js	L(mul_1_top)		C loop while n is still negative
+
+	mov	w0, -16(rp)
+	add	%rax, w1
+	mov	w1, -8(rp)
+	mov	w2, 8(rp)		C zero last limb of output
+	adc	%rdx, w2
+	mov	w2, (rp)
+
+	dec	vn			C one vp limb consumed
+	jz	L(ret)
+
+	lea	-8(up), up		C next pass: up steps back one limb,
+	lea	8(vp), vp		C vp advances one limb
+
+	mov	un, n			C reload the rounded negative row length
+	mov	(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	8(vp), v1
+
+	jmp	*outer_addr		C enter the addmul prologue chosen above
+
+C ===========================================================
+C     mul_2 for vp[0], vp[1] if vn is even
+
+	ALIGN(16)
+L(mul_2):
+	mov	R32(un), R32(w0)	C w0 = row length, reduced mod 4 below
+
+	neg	un			C un is negative from here on
+	mov	-8(up,un,8), %rax
+	mov	(vp), v0		C v0 = vp[0]
+	mov	8(vp), v1		C v1 = vp[1]
+	mul	v1
+
+	and	$-4, un		C round down to multiple of 4
+	mov	un, n
+
+	and	$3, R32(w0)		C row length mod 4 selects the loop entry point
+	jz	L(mul_2_prologue_0)
+	cmp	$2, R32(w0)
+	jc	L(mul_2_prologue_1)	C below 2: remainder 1
+	jz	L(mul_2_prologue_2)	C equal: remainder 2; fall through for 3
+
+L(mul_2_prologue_3):
+	mov	%rax, w1
+	mov	%rdx, w2
+	lea	L(addmul_prologue_3)(%rip), outer_addr
+	jmp	L(mul_2_entry_3)
+
+	ALIGN(16)
+L(mul_2_prologue_0):
+	mov	%rax, w0
+	mov	%rdx, w1
+	lea	L(addmul_prologue_0)(%rip), outer_addr
+	jmp	L(mul_2_entry_0)
+
+	ALIGN(16)
+L(mul_2_prologue_1):
+	mov	%rax, w3
+	mov	%rdx, w0
+	mov	$0, R32(w1)
+	lea	L(addmul_prologue_1)(%rip), outer_addr
+	jmp	L(mul_2_entry_1)
+
+	ALIGN(16)
+L(mul_2_prologue_2):
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	$0, R32(w0)
+	mov	16(up,n,8), %rax
+	lea	L(addmul_prologue_2)(%rip), outer_addr
+	jmp	L(mul_2_entry_2)
+
+
+	C this loop is 18 c/loop = 2.25 c/l on K8
+
+	ALIGN(16)
+L(mul_2_top):
+	mov     -8(up,n,8), %rax
+	mul     v1
+	add     %rax, w0
+	adc     %rdx, w1
+L(mul_2_entry_0):
+	mov     $0, R32(w2)
+	mov     (up,n,8), %rax
+	mul     v0
+	add     %rax, w0
+	mov     (up,n,8), %rax
+	adc     %rdx, w1
+	adc     $0, R32(w2)
+	mul     v1
+	add     %rax, w1
+	mov     w0, (rp,n,8)
+	adc     %rdx, w2
+L(mul_2_entry_3):
+	mov     8(up,n,8), %rax
+	mul     v0
+	mov     $0, R32(w3)
+	add     %rax, w1
+	adc     %rdx, w2
+	mov     $0, R32(w0)
+	adc     $0, R32(w3)
+	mov     8(up,n,8), %rax
+	mov     w1, 8(rp,n,8)
+	mul     v1
+	add     %rax, w2
+	mov     16(up,n,8), %rax
+	adc     %rdx, w3
+L(mul_2_entry_2):
+	mov     $0, R32(w1)
+	mul     v0
+	add     %rax, w2
+	mov     16(up,n,8), %rax
+	adc     %rdx, w3
+	adc     $0, R32(w0)
+	mul     v1
+	add     %rax, w3
+	mov     w2, 16(rp,n,8)
+	adc     %rdx, w0
+L(mul_2_entry_1):
+	mov     24(up,n,8), %rax
+	mul     v0
+	add     %rax, w3
+	adc     %rdx, w0
+	adc     $0, R32(w1)
+	add     $4, n
+	mov     w3, -8(rp,n,8)
+	jnz     L(mul_2_top)		C n is a multiple of 4 here, so it reaches exactly zero
+
+	mov	w0, (rp)		C store the two top limbs
+	mov	w1, 8(rp)
+
+	sub	$2, vn			C two vp limbs consumed
+	jz	L(ret)
+
+	lea	16(vp), vp		C next pass: vp advances two limbs,
+	lea	-16(up), up		C up steps back two limbs
+
+	mov	un, n			C reload the rounded negative row length
+	mov	(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	8(vp), v1
+
+	jmp	*outer_addr		C enter the addmul prologue chosen above
+
+C ===========================================================
+C     addmul_2 for remaining vp's: two vp limbs are added into rp[] per pass
+
+	ALIGN(16)
+L(addmul_prologue_0):
+	mov	-8(up,n,8), %rax
+	mul	v1
+	mov	%rax, w1
+	mov	%rdx, w2
+	mov	$0, R32(w3)
+	jmp	L(addmul_entry_0)
+
+	ALIGN(16)
+L(addmul_prologue_1):
+	mov	16(up,n,8), %rax
+	mul	v1
+	mov	%rax, w0
+	mov	%rdx, w1
+	mov	$0, R32(w2)
+	mov	24(up,n,8), %rax
+	jmp	L(addmul_entry_1)
+
+	ALIGN(16)
+L(addmul_prologue_2):
+	mov	8(up,n,8), %rax
+	mul	v1
+	mov	%rax, w3
+	mov	%rdx, w0
+	mov	$0, R32(w1)
+	jmp	L(addmul_entry_2)
+
+	ALIGN(16)
+L(addmul_prologue_3):
+	mov	(up,n,8), %rax
+	mul	v1
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	$0, R32(w0)
+	mov	$0, R32(w1)
+	jmp	L(addmul_entry_3)
+
+	C this loop is 19 c/loop = 2.375 c/l on K8
+
+	ALIGN(16)
+L(addmul_top):
+	mov	$0, R32(w3)
+	add	%rax, w0
+	mov	-8(up,n,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1
+	add	w0, -8(rp,n,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+L(addmul_entry_0):
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, w1
+	mov	(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1
+	add	w1, (rp,n,8)
+	mov	$0, R32(w1)		C mov leaves the carry flag alone for the adc chain
+	adc	%rax, w2
+	mov	$0, R32(w0)
+	adc	%rdx, w3
+L(addmul_entry_3):
+	mov	8(up,n,8), %rax
+	mul	v0
+	add	%rax, w2
+	mov	8(up,n,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 8(rp,n,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+L(addmul_entry_2):
+	mov	16(up,n,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	16(up,n,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	w3, 16(rp,n,8)
+	nop			C don't ask...
+	adc	%rax, w0
+	mov	$0, R32(w2)
+	mov	24(up,n,8), %rax
+	adc	%rdx, w1
+L(addmul_entry_1):
+	mul	v0
+	add	$4, n			C four limbs per iteration
+	jnz	L(addmul_top)		C loop until n reaches zero
+
+	add	%rax, w0		C wind down: finish the three accumulator limbs
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+
+	add	w0, -8(rp)		C and add them into the top of the result
+	adc	w1, (rp)
+	adc	w2, 8(rp)
+
+	sub	$2, vn			C two vp limbs consumed
+	jz	L(ret)
+
+	lea	16(vp), vp		C next pass: vp advances two limbs,
+	lea	-16(up), up		C up steps back two limbs
+
+	mov	un, n			C reload the rounded negative row length
+	mov	(vp), v0		C v0 and v1 = the next two vp limbs
+	mov	8(vp), v1
+
+	jmp	*outer_addr		C same addmul prologue again
+
+C ===========================================================
+C     accumulate along diagonals if un - vn is small
+
+	ALIGN(16)
+L(diagonal):
+	xor	R32(w0), R32(w0)	C w0 w1 w2 form a three-limb accumulator
+	xor	R32(w1), R32(w1)
+	xor	R32(w2), R32(w2)
+
+	neg	un			C un counts the output limbs upwards to zero
+
+	mov	R32(vn), %eax
+	and	$3, %eax		C vn mod 4 selects the loop entry point
+	jz	L(diag_prologue_0)
+	cmp	$2, %eax
+	jc	L(diag_prologue_1)	C below 2: remainder 1
+	jz	L(diag_prologue_2)	C equal: remainder 2; fall through for 3
+
+L(diag_prologue_3):
+	lea	-8(vp), vp
+	mov	vp, vp_inner
+	add	$1, vn			C vn becomes a multiple of 4
+	mov	vn, n
+	lea	L(diag_entry_3)(%rip), outer_addr
+	jmp	L(diag_entry_3)
+
+L(diag_prologue_0):
+	mov	vp, vp_inner
+	mov	vn, n
+	lea	0(%rip), outer_addr	C outer_addr = address of the next instruction, reused for every diagonal
+	mov     -8(up,n,8), %rax
+	jmp	L(diag_entry_0)
+
+L(diag_prologue_1):
+	lea	8(vp), vp
+	mov	vp, vp_inner
+	add	$3, vn			C vn becomes a multiple of 4
+	mov	vn, n
+	lea	0(%rip), outer_addr	C later diagonals re-enter at the next instruction
+	mov     -8(vp_inner), %rax
+	jmp	L(diag_entry_1)
+
+L(diag_prologue_2):
+	lea	-16(vp), vp
+	mov	vp, vp_inner
+	add	$2, vn			C vn becomes a multiple of 4
+	mov	vn, n
+	lea	0(%rip), outer_addr	C later diagonals re-enter at the next instruction
+	mov	16(vp_inner), %rax
+	jmp	L(diag_entry_2)
+
+
+	C this loop is 10 c/loop = 2.5 c/l on K8
+
+	ALIGN(16)
+L(diag_top):
+	add     %rax, w0
+	adc     %rdx, w1
+	mov     -8(up,n,8), %rax
+	adc     $0, w2
+L(diag_entry_0):
+	mulq    (vp_inner)
+	add     %rax, w0
+	adc     %rdx, w1
+	adc     $0, w2
+L(diag_entry_3):
+	mov     -16(up,n,8), %rax
+	mulq    8(vp_inner)
+	add     %rax, w0
+	mov     16(vp_inner), %rax
+	adc     %rdx, w1
+	adc     $0, w2
+L(diag_entry_2):
+	mulq    -24(up,n,8)
+	add     %rax, w0
+	mov     24(vp_inner), %rax
+	adc     %rdx, w1
+	lea     32(vp_inner), vp_inner
+	adc     $0, w2
+L(diag_entry_1):
+	mulq    -32(up,n,8)
+	sub     $4, n			C four products per iteration
+	jnz	L(diag_top)		C loop until n reaches zero
+
+	add	%rax, w0		C add the last product of this diagonal
+	adc	%rdx, w1
+	adc	$0, w2
+
+	mov	w0, (rp,un,8)		C lowest accumulator limb is final for this output position
+
+	inc	un
+	jz	L(diag_end)		C all diagonals done
+
+	mov	vn, n			C reset the inner counter and vp pointer
+	mov	vp, vp_inner
+
+	lea	8(up), up		C next diagonal starts one up limb further on
+	mov	w1, w0			C shift the accumulator down one limb
+	mov	w2, w1
+	xor	R32(w2), R32(w2)
+
+	jmp	*outer_addr
+
+L(diag_end):
+	mov	w1, (rp)		C store the two remaining accumulator limbs
+	mov	w2, 8(rp)
+
+
+L(ret):	pop	%r15		C common exit: restore registers in reverse push order
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm
new file mode 100644
index 0000000..9327b21
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm
@@ -0,0 +1,591 @@
+dnl  X86-64 mpn_redc_1 optimised for AMD K8-K10.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2004, 2008, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bull	 ?
+C AMD pile	 ?
+C AMD steam	 ?
+C AMD bobcat	 ?
+C AMD jaguar	 ?
+C Intel P4	 ?
+C Intel core	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel atom	 ?
+C VIA nano	 ?
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C TODO
+C  * Micro-optimise, none performed thus far.
+C  * This looks different from other current redc_1.asm variants.  Consider
+C    adapting this to the mainstream style.
+C  * Is this code really faster than more approaches which compute q0 later?
+C    Is the use of a jump table faster?  Or is the edge of this due to the
+C    inlined add_n code?
+C  * Put initial m[0] x q0 computation in header.
+C  * Put basecases at the file's end, single them out before the pushes.
+
+define(`rp',          `%rdi')   C rcx
+define(`up',          `%rsi')   C rdx
+define(`mp_param',    `%rdx')   C r8
+define(`n',           `%rcx')   C r9
+define(`u0inv',       `%r8')    C stack
+
+define(`i',           `%r11')
+define(`nneg',        `%r12')
+define(`mp',          `%r13')
+define(`q0',          `%rbp')
+define(`vp',          `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+	push	%rbp			C entry: save registers, compute the first q0, dispatch on n
+	mov	(up), q0		C up[0]
+	push	%rbx
+	imul	u0inv, q0		C first q0, for all execution paths
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15			C six registers pushed; L(ret) pops them in reverse order
+
+	mov	n, nneg
+	neg	nneg			C nneg = -n
+	lea	(mp_param,n,8), mp	C mp = mp_param + n limbs
+	lea	-16(up,n,8), up		C up += n - 2 limbs
+
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)		C rax = n mod 4
+	lea	4(%rax), %r9		C r9 = 4 + n mod 4
+	cmp	$4, R32(n)
+	cmovg	%r9, %rax		C n > 4 (signed) selects table entries 4..7
+	lea	L(tab)(%rip), %r9	C r9 = table base; PIC reads 32-bit entries added to it, else 8-byte targets
+ifdef(`PIC',`
+	movslq	(%r9,%rax,4), %rax
+	add	%r9, %rax
+	jmp	*%rax
+',`
+	jmp	*(%r9,%rax,8)
+')
+
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))		C index 0: n <= 4, n mod 4 = 0
+	JMPENT(	L(1), L(tab))		C index 1: n <= 4, n mod 4 = 1
+	JMPENT(	L(2), L(tab))		C index 2: n <= 4, n mod 4 = 2
+	JMPENT(	L(3), L(tab))		C index 3: n <= 4, n mod 4 = 3
+	JMPENT(	L(0m4), L(tab))		C index 4: n > 4, n mod 4 = 0
+	JMPENT(	L(1m4), L(tab))		C index 5: n > 4, n mod 4 = 1
+	JMPENT(	L(2m4), L(tab))		C index 6: n > 4, n mod 4 = 2
+	JMPENT(	L(3m4), L(tab))		C index 7: n > 4, n mod 4 = 3
+	TEXT
+
+	ALIGN(16)
+L(1):	mov	(mp_param), %rax	C n = 1 case: rax = m[0]
+	mul	q0			C rdx:rax = m[0] * q0
+	add	8(up), %rax		C add original up[0] (up was moved back one limb at entry); only CF is used
+	adc	16(up), %rdx		C high limb + original up[1] + carry
+	mov	%rdx, (rp)		C rp[0] = result limb
+	mov	$0, R32(%rax)		C clear rax with mov so CF survives
+	adc	R32(%rax), R32(%rax)	C rax = carry out, left in rax for the return
+	jmp	L(ret)
+
+
+	ALIGN(16)
+L(2):	mov	(mp_param), %rax	C n = 2 case (up is unchanged by the entry lea): rax = m[0]
+	mul	q0			C pass 1: m[0] * q0
+	xor	R32(%r14), R32(%r14)
+	mov	%rax, %r10
+	mov	-8(mp), %rax		C rax = m[1]
+	mov	%rdx, %r9
+	mul	q0			C m[1] * q0
+	add	(up), %r10		C add up[0]; only the carry is used (r10 is overwritten below)
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	add	8(up), %r9		C r9 = updated limb 1
+	adc	$0, %r14		C r14 = pass 1 top limb
+	mov	%r9, q0
+	imul	u0inv, q0		C q0 for pass 2
+	mov	-16(mp), %rax		C rax = m[0]
+	mul	q0			C pass 2: m[0] * q0
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r10
+	mov	-8(mp), %rax		C rax = m[1]
+	mov	%rdx, %r11
+	mul	q0			C m[1] * q0
+	add	%r9, %r10		C add updated limb 1; only the carry is used
+	adc	%rax, %r11
+	adc	%rdx, %rbx
+	add	16(up), %r11		C fold in up[2]
+	adc	$0, %rbx
+	xor	R32(%rax), R32(%rax)	C rax = 0 for the carry return
+	add	%r11, %r14		C combine pass 1 top limb with pass 2 result
+	adc	24(up), %rbx		C fold in up[3]
+	mov	%r14, (rp)		C rp[0]
+	mov	%rbx, 8(rp)		C rp[1]
+	adc	R32(%rax), R32(%rax)	C rax = carry out (mov does not alter CF)
+	jmp	L(ret)
+
+
+L(3):	mov	(mp_param), %rax	C n = 3 case (up points at original up[1] here): rax = m[0]
+	mul	q0			C pass 1: m[0] * q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	-16(mp), %rax		C rax = m[1]
+	mul	q0
+	xor	R32(%r9), R32(%r9)
+	xor	R32(%r14), R32(%r14)
+	add	-8(up), %rbx		C add original up[0]; only the carry is used (rbx is overwritten later)
+	adc	%rax, %r10
+	mov	-8(mp), %rax		C rax = m[2]
+	adc	%rdx, %r9
+	mul	q0
+	add	(up), %r10
+	mov	%r10, (up)		C store the updated limb in place
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	mov	%r10, q0
+	imul	u0inv, q0		C q0 for pass 2
+	add	%r9, 8(up)
+	adc	$0, %r14
+	mov	%r14, -8(up)		C park the pass 1 top limb in the consumed slot below
+
+	mov	-24(mp), %rax		C pass 2: rax = m[0]
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	-16(mp), %rax		C rax = m[1]
+	mul	q0
+	xor	R32(%r9), R32(%r9)
+	xor	R32(%r14), R32(%r14)
+	add	(up), %rbx		C only the carry is used
+	adc	%rax, %r10
+	mov	-8(mp), %rax		C rax = m[2]
+	adc	%rdx, %r9
+	mul	q0
+	add	8(up), %r10
+	mov	%r10, 8(up)		C store the updated limb in place
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	mov	%r10, q0
+	imul	u0inv, q0		C q0 for pass 3
+	add	%r9, 16(up)
+	adc	$0, %r14
+	mov	%r14, (up)		C park the pass 2 top limb in the consumed slot
+
+	mov	-24(mp), %rax		C pass 3: rax = m[0]
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	-16(mp), %rax		C rax = m[1]
+	mul	q0
+	xor	R32(%r9), R32(%r9)
+	xor	R32(%r14), R32(%r14)
+	add	8(up), %rbx		C only the carry is used
+	adc	%rax, %r10
+	mov	-8(mp), %rax		C rax = m[2]
+	adc	%rdx, %r9
+	mul	q0
+	add	16(up), %r10
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	add	24(up), %r9
+	adc	$0, %r14
+
+	xor	R32(%rax), R32(%rax)	C rax = 0 for the carry return
+	add	-8(up), %r10		C add the parked pass 1 top limb
+	adc	(up), %r9		C add the parked pass 2 top limb
+	adc	32(up), %r14		C add the highest input limb
+	mov	%r10, (rp)
+	mov	%r9, 8(rp)
+	mov	%r14, 16(rp)
+	adc	R32(%rax), R32(%rax)	C rax = carry out (mov does not alter CF)
+	jmp	L(ret)
+
+
+	ALIGN(16)
+L(2m4):					C reached from jump table index 6 (n > 4, n mod 4 = 2)
+L(lo2):	mov	(mp,nneg,8), %rax	C outer loop, one pass per q0: rax = m[0]
+	mul	q0
+	xor	R32(%r14), R32(%r14)
+	xor	R32(%rbx), R32(%rbx)
+	mov	%rax, %r10
+	mov	8(mp,nneg,8), %rax	C rax = m[1]
+	mov	24(up,nneg,8), %r15	C r15 = limb just above the one being cancelled
+	mov	%rdx, %r9
+	mul	q0
+	add	16(up,nneg,8), %r10	C add the limb being cancelled; only the carry is kept (r10 is cleared below)
+	adc	%rax, %r9
+	mov	16(mp,nneg,8), %rax	C rax = m[2]
+	adc	%rdx, %r14
+	mul	q0
+	mov	$0, R32(%r10)		C xor?
+	lea	2(nneg), i		C i = 2 - n, inner loop index (negative)
+	add	%r9, %r15
+	imul	u0inv, %r15		C r15 = q0 for the next pass
+	jmp	 L(e2)
+
+	ALIGN(16)
+L(li2):	add	%r10, (up,i,8)		C inner loop: four limbs of m * q0 added into up per iteration
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+L(e2):	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li2)			C loop while i < 0
+
+L(le2):	add	%r10, (up)		C wind down: flush the last pending limbs
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	add	$8, up			C advance the window by one limb
+	mov	%r15, q0		C q0 for the next pass
+	dec	n
+	jnz	L(lo2)			C n passes in total
+
+	mov	nneg, n
+	sar	$2, n			C n = nneg >> 2 (arithmetic), counter for the final addition
+	lea	32(up,nneg,8), up
+	lea	(up,nneg,8), vp		C vp = up - n limbs
+
+	mov	-16(up), %r8		C add the two leading limbs before the 4-limb loop
+	mov	-8(up), %r9
+	add	-16(vp), %r8
+	adc	-8(vp), %r9
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	lea	16(rp), rp		C lea keeps CF live for L(addx)
+	jmp	L(addx)
+
+
+	ALIGN(16)
+L(1m4):					C reached from jump table index 5 (n > 4, n mod 4 = 1)
+L(lo1):	mov	(mp,nneg,8), %rax	C outer loop, one pass per q0: rax = m[0]
+	xor	%r9, %r9		C NOTE(review): r9 is overwritten by the mov three lines below; looks redundant
+	xor	R32(%rbx), R32(%rbx)
+	mul	q0
+	mov	%rax, %r9
+	mov	8(mp,nneg,8), %rax	C rax = m[1]
+	mov	24(up,nneg,8), %r15	C r15 = limb just above the one being cancelled
+	mov	%rdx, %r14
+	mov	$0, R32(%r10)		C xor?
+	mul	q0
+	add	16(up,nneg,8), %r9	C add the limb being cancelled; only the carry is kept (r9 is reloaded before reuse)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	16(mp,nneg,8), %rax	C rax = m[2]
+	mul	q0
+	lea	1(nneg), i		C i = 1 - n, inner loop index (negative)
+	add	%r14, %r15
+	imul	u0inv, %r15		C r15 = q0 for the next pass
+	jmp	 L(e1)
+
+	ALIGN(16)
+L(li1):	add	%r10, (up,i,8)		C inner loop: four limbs of m * q0 added into up per iteration
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+L(e1):	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li1)			C loop while i < 0
+
+L(le1):	add	%r10, (up)		C wind down: flush the last pending limbs
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	add	$8, up			C advance the window by one limb
+	mov	%r15, q0		C q0 for the next pass
+	dec	n
+	jnz	L(lo1)			C n passes in total
+
+	mov	nneg, n
+	sar	$2, n			C n = nneg >> 2 (arithmetic), counter for the final addition
+	lea	24(up,nneg,8), up
+	lea	(up,nneg,8), vp		C vp = up - n limbs
+
+	mov	-8(up), %r8		C add the single leading limb before the 4-limb loop
+	add	-8(vp), %r8
+	mov	%r8, (rp)
+	lea	8(rp), rp		C lea keeps CF live for L(addx)
+	jmp	L(addx)
+
+
+	ALIGN(16)
+L(0):					C jump table index 0 (n <= 4, n mod 4 = 0) shares this code
+L(0m4):					C jump table index 4 (n > 4, n mod 4 = 0)
+L(lo0):	mov	(mp,nneg,8), %rax	C outer loop, one pass per q0: rax = m[0]
+	mov	nneg, i			C i = -n, inner loop index (negative)
+	mul	q0
+	xor	R32(%r10), R32(%r10)
+	mov	%rax, %r14
+	mov	%rdx, %rbx
+	mov	8(mp,nneg,8), %rax	C rax = m[1]
+	mov	24(up,nneg,8), %r15	C r15 = limb just above the one being cancelled
+	mul	q0
+	add	16(up,nneg,8), %r14	C add the limb being cancelled; only the carry is kept (r14 is cleared after L(e0))
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	add	%rbx, %r15
+	imul	u0inv, %r15		C r15 = q0 for the next pass
+	jmp	L(e0)
+
+	ALIGN(16)
+L(li0):	add	%r10, (up,i,8)		C inner loop: four limbs of m * q0 added into up per iteration
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+L(e0):	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li0)			C loop while i < 0
+
+L(le0):	add	%r10, (up)		C wind down: flush the last pending limbs
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	add	$8, up			C advance the window by one limb
+	mov	%r15, q0		C q0 for the next pass
+	dec	n
+	jnz	L(lo0)			C n passes in total
+
+	mov	nneg, n
+	sar	$2, n			C n = nneg >> 2 (arithmetic), counter for the final addition
+	clc				C no leading limbs on this path, so enter the adc chain with CF = 0
+	lea	16(up,nneg,8), up
+	lea	(up,nneg,8), vp		C vp = up - n limbs
+	jmp	L(addy)
+
+
+	ALIGN(16)
+L(3m4):					C reached from jump table index 7 (n > 4, n mod 4 = 3)
+L(lo3):	mov	(mp,nneg,8), %rax	C outer loop, one pass per q0: rax = m[0]
+	mul	q0
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	mov	8(mp,nneg,8), %rax	C rax = m[1]
+	mov	24(up,nneg,8), %r15	C r15 = limb just above the one being cancelled
+	mul	q0
+	add	16(up,nneg,8), %rbx	C result is zero, might carry
+	mov	$0, R32(%rbx)		C zero
+	mov	%rbx, %r14		C zero
+	adc	%rax, %r10
+	mov	16(mp,nneg,8), %rax	C rax = m[2]
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	add	%r10, %r15
+	mul	q0
+	lea	3(nneg), i		C i = 3 - n, inner loop index (negative)
+	imul	u0inv, %r15		C r15 = q0 for the next pass
+C	jmp	L(li3)			(disabled: execution falls through into L(li3))
+
+	ALIGN(16)
+L(li3):	add	%r10, (up,i,8)		C inner loop: four limbs of m * q0 added into up per iteration
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	R32(%r10), R32(%r10)
+	mul	q0
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	q0
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	q0
+	add	%rbx, 24(up,i,8)
+	mov	$0, R32(%r14)		C zero
+	mov	%r14, %rbx		C zero
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%r14, %r9		C zero
+	adc	%rdx, %r9
+	mul	q0
+	add	$4, i
+	js	 L(li3)			C loop while i < 0
+
+L(le3):	add	%r10, (up)		C wind down: flush the last pending limbs
+	adc	%rax, %r9
+	adc	%r14, %rdx
+	add	%r9, 8(up)
+	adc	$0, %rdx
+	mov	%rdx, 16(up,nneg,8)	C up[0]
+	mov	%r15, q0		C q0 for the next pass
+	lea	8(up), up		C advance the window by one limb
+	dec	n
+	jnz	L(lo3)			C n passes in total, then fall through to the addition code
+
+
+C ==== Addition code ====  (adds the limbs at vp = up - n limbs to those at up, writes rp, returns carry)
+	mov	nneg, n
+	sar	$2, n			C n = nneg >> 2 (arithmetic), counts 4-limb groups up to zero
+	lea	40(up,nneg,8), up
+	lea	(up,nneg,8), vp		C vp = up - n limbs
+
+	mov	-24(up), %r8		C 3m4 path: add the three leading limbs first
+	mov	-16(up), %r9
+	mov	-8(up), %r10
+	add	-24(vp), %r8
+	adc	-16(vp), %r9
+	adc	-8(vp), %r10
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	lea	24(rp), rp		C lea keeps CF live
+
+L(addx):inc	n			C inc leaves CF untouched, so the pending carry survives
+	jz	L(ad3)			C no 4-limb group left
+
+L(addy):mov	(up), %r8
+	mov	8(up), %r9
+	inc	n
+	jmp	L(mid)
+
+C	ALIGN(16)
+L(al3):	adc	(vp), %r8		C main loop: four limbs per iteration, carry chained through CF
+	adc	8(vp), %r9
+	adc	16(vp), %r10
+	adc	24(vp), %r11
+	mov	%r8, (rp)
+	lea	32(up), up
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	inc	n			C sets ZF for the jnz below, CF untouched
+	mov	%r11, 24(rp)
+	lea	32(vp), vp
+	mov	(up), %r8
+	mov	8(up), %r9
+	lea	32(rp), rp
+L(mid):	mov	16(up), %r10
+	mov	24(up), %r11
+	jnz	L(al3)			C tests ZF from the last inc n (mov and lea do not alter flags)
+
+L(ae3):	adc	(vp), %r8		C final group of four limbs
+	adc	8(vp), %r9
+	adc	16(vp), %r10
+	adc	24(vp), %r11
+	mov	%r8, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%r11, 24(rp)
+
+L(ad3):	mov	R32(n), R32(%rax)	C zero
+	adc	R32(%rax), R32(%rax)	C rax = final carry, left in rax for the return
+
+L(ret):	pop	%r15			C restore the registers pushed at entry, in reverse order
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm
new file mode 100644
index 0000000..60cf945
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm
@@ -0,0 +1,807 @@
+dnl  AMD64 mpn_sqr_basecase.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
+C     large trip count.  Instead, we should follow the generic/sqr_basecase.c
+C     code which uses addmul_2s from the start, conditionally leaving a 1x1
+C     multiply to the end.  (In assembly code, one would stop invoking
+C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
+C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
+C     save/restore carry, instead it can propagate into the high product word.
+C   * Align more labels, should shave off a few cycles.
+C   * We can safely use 32-bit size operations, since operands with (2^32)
+C     limbs will lead to non-termination in practice.
+C   * The jump table could probably be optimized, at least for non-pic.
+C   * The special code for n <= 4 was quickly written.  It is probably too
+C     large and unnecessarily slow.
+C   * Consider combining small cases code so that the n=k-1 code jumps into the
+C     middle of the n=k code.
+C   * Avoid saving registers for small cases code.
+C   * Needed variables:
+C    n   r11  input size
+C    i   r8   work left, initially n
+C    j   r9   inner loop count
+C        r15  unused
+C    v0  r13
+C    v1  r14
+C    rp  rdi
+C    up  rsi
+C    w0  rbx
+C    w1  rcx
+C    w2  rbp
+C    w3  r10
+C    tp  r12
+C    lo  rax
+C    hi  rdx
+C        rsp
+
+C INPUT PARAMETERS
+define(`rp',	  `%rdi')
+define(`up',	  `%rsi')
+define(`n_param', `%rdx')
+
+define(`n',	`%r11')
+define(`tp',	`%r12')
+define(`i',	`%r8')
+define(`j',	`%r9')
+define(`v0',	`%r13')
+define(`v1',	`%r14')
+define(`w0',	`%rbx')
+define(`w1',	`%rcx')
+define(`w2',	`%rbp')
+define(`w3',	`%r10')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+	mov	R32(n_param), R32(%rcx)
+	mov	R32(n_param), R32(n)		C free original n register (rdx)
+
+	add	$-40, %rsp			C reserve 40 bytes: five 8-byte register save slots
+
+	and	$3, R32(%rcx)			C rcx = n mod 4
+	cmp	$4, R32(n_param)
+	lea	4(%rcx), %r8			C r8 = 4 + n mod 4 (lea and mov keep the cmp flags)
+
+	mov	%rbx, 32(%rsp)			C save rbx, rbp, r12, r13, r14 in the frame
+	mov	%rbp, 24(%rsp)
+	mov	%r12, 16(%rsp)
+	mov	%r13, 8(%rsp)
+	mov	%r14, (%rsp)
+
+	cmovg	%r8, %rcx			C n > 4 (signed) selects table entries 4..7
+
+	lea	L(tab)(%rip), %rax		C rax = table base; PIC reads 32-bit entries added to it, else 8-byte targets
+ifdef(`PIC',
+`	movslq	(%rax,%rcx,4), %r10
+	add	%r10, %rax
+	jmp	*%rax
+',`
+	jmp	*(%rax,%rcx,8)
+')
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(4), L(tab))		C index 0: n <= 4, n mod 4 = 0
+	JMPENT(	L(1), L(tab))		C index 1: n <= 4, n mod 4 = 1
+	JMPENT(	L(2), L(tab))		C index 2: n <= 4, n mod 4 = 2
+	JMPENT(	L(3), L(tab))		C index 3: n <= 4, n mod 4 = 3
+	JMPENT(	L(0m4), L(tab))		C index 4: n > 4, n mod 4 = 0
+	JMPENT(	L(1m4), L(tab))		C index 5: n > 4, n mod 4 = 1
+	JMPENT(	L(2m4), L(tab))		C index 6: n > 4, n mod 4 = 2
+	JMPENT(	L(3m4), L(tab))		C index 7: n > 4, n mod 4 = 3
+	TEXT
+
+L(1):	mov	(up), %rax		C n = 1 case: rax = u0
+	mul	%rax			C rdx:rax = u0^2
+	add	$40, %rsp		C drop the frame; this path changed none of the five saved registers
+	mov	%rax, (rp)		C rp[0] = low limb
+	mov	%rdx, 8(rp)		C rp[1] = high limb
+	FUNC_EXIT()
+	ret
+
+L(2):	mov	(up), %rax		C n = 2 case: rax = u0
+	mov	%rax, %r8		C r8 = u0, kept for the cross product
+	mul	%rax			C u0^2
+	mov	8(up), %r11		C r11 = u1
+	mov	%rax, (rp)		C rp[0] = low(u0^2)
+	mov	%r11, %rax
+	mov	%rdx, %r9		C r9 = high(u0^2)
+	mul	%rax			C u1^2
+	add	$40, %rsp		C drop the frame; this path changed none of the five saved registers
+	mov	%rax, %r10		C r10 = low(u1^2)
+	mov	%r11, %rax
+	mov	%rdx, %r11		C r11 = high(u1^2)
+	mul	%r8			C rdx:rax = u0 * u1
+	xor	%r8, %r8		C r8 = 0, used to propagate carries
+	add	%rax, %r9		C add the cross product once ...
+	adc	%rdx, %r10
+	adc	%r8, %r11
+	add	%rax, %r9		C ... and a second time, giving 2 * u0 * u1
+	mov	%r9, 8(rp)
+	adc	%rdx, %r10
+	mov	%r10, 16(rp)
+	adc	%r8, %r11
+	mov	%r11, 24(rp)
+	FUNC_EXIT()
+	ret
+
+L(3):	mov	(up), %rax		C n = 3 case: rax = u0
+	mov	%rax, %r10		C r10 = u0
+	mul	%rax			C u0^2
+	mov	8(up), %r11		C r11 = u1
+	mov	%rax, (rp)		C rp[0..1] = u0^2
+	mov	%r11, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax			C u1^2
+	mov	16(up), %rcx		C rcx = u2
+	mov	%rax, 16(rp)		C rp[2..3] = u1^2
+	mov	%rcx, %rax
+	mov	%rdx, 24(rp)
+	mul	%rax			C u2^2
+	mov	%rax, 32(rp)		C rp[4..5] = u2^2
+	mov	%rdx, 40(rp)
+
+	mov	%r11, %rax
+	mul	%r10			C u1 * u0
+	mov	%rax, %r8
+	mov	%rcx, %rax
+	mov	%rdx, %r9
+	mul	%r10			C u2 * u0
+	xor	%r10, %r10
+	add	%rax, %r9
+	mov	%r11, %rax
+	mov	%r10, %r11		C r11 = 0 (mov keeps CF from the add above)
+	adc	%rdx, %r10
+
+	mul	%rcx			C u1 * u2
+	add	$40, %rsp		C drop the frame; this path changed none of the five saved registers
+	add	%rax, %r10
+	adc	%r11, %rdx
+	add	%r8, %r8		C double the cross-product sum held in r8, r9, r10, rdx ...
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%rdx, %rdx
+	adc	%r11, %r11		C ... r11 receives the bit shifted out at the top
+	add	%r8, 8(rp)		C add the doubled cross products onto the squares at rp[1..5]
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%rdx, 32(rp)
+	adc	%r11, 40(rp)
+	FUNC_EXIT()
+	ret
+
+L(4):	mov	(up), %rax		C n = 4 case: rax = u0
+	mov	%rax, %r11		C r11 = u0
+	mul	%rax			C u0^2
+	mov	8(up), %rbx		C rbx = u1 (rbx was saved at 32(%rsp) on entry)
+	mov	%rax, (rp)		C rp[0..1] = u0^2
+	mov	%rbx, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax			C u1^2
+	mov	%rax, 16(rp)		C rp[2..3] = u1^2
+	mov	%rdx, 24(rp)
+	mov	16(up), %rax
+	mul	%rax			C u2^2
+	mov	%rax, 32(rp)		C rp[4..5] = u2^2
+	mov	%rdx, 40(rp)
+	mov	24(up), %rax
+	mul	%rax			C u3^2
+	mov	%rax, 48(rp)		C rp[6..7] = u3^2
+	mov	%rbx, %rax
+	mov	%rdx, 56(rp)
+
+	mul	%r11			C u1 * u0
+	add	$32, %rsp		C rsp now points at the saved rbx slot
+	mov	%rax, %r8
+	mov	%rdx, %r9
+	mov	16(up), %rax
+	mul	%r11			C u2 * u0
+	xor	%r10, %r10
+	add	%rax, %r9
+	adc	%rdx, %r10
+	mov	24(up), %rax
+	mul	%r11			C u3 * u0
+	xor	%r11, %r11
+	add	%rax, %r10
+	adc	%rdx, %r11
+	mov	16(up), %rax
+	mul	%rbx			C u2 * u1
+	xor	%rcx, %rcx
+	add	%rax, %r10
+	adc	%rdx, %r11
+	adc	$0, %rcx
+	mov	24(up), %rax
+	mul	%rbx			C u3 * u1
+	pop	%rbx			C restore rbx and free the last 8 bytes of the frame
+	add	%rax, %r11
+	adc	%rdx, %rcx
+	mov	16(up), %rdx
+	mov	24(up), %rax
+	mul	%rdx			C u3 * u2
+	add	%rax, %rcx
+	adc	$0, %rdx
+
+	add	%r8, %r8		C double the cross-product sum held in r8, r9, r10, r11, rcx, rdx
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%r11, %r11
+	adc	%rcx, %rcx
+	mov	$0, R32(%rax)		C clear rax with mov so CF survives
+	adc	%rdx, %rdx
+
+	adc	%rax, %rax		C rax = bit shifted out at the top
+	add	%r8, 8(rp)		C add the doubled cross products onto the squares at rp[1..7]
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%r11, 32(rp)
+	adc	%rcx, 40(rp)
+	adc	%rdx, 48(rp)
+	adc	%rax, 56(rp)
+	FUNC_EXIT()
+	ret
+
+
+L(0m4):					C reached from jump table index 4 (n > 4, n mod 4 = 0)
+	lea	-16(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C v0 = u0
+	mov	8(up), %rax		C rax = u1
+	lea	(up,n,8), up		C point up at end of input operand
+
+	lea	-4(n), i		C i = n - 4
+C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
+	xor	R32(j), R32(j)
+	sub	n, j			C j = -n, loop index (negative)
+
+	mul	v0			C u1 * u0
+	xor	R32(w2), R32(w2)
+	mov	%rax, w0
+	mov	16(up,j,8), %rax
+	mov	%rdx, w3
+	jmp	L(L3)
+
+	ALIGN(16)
+L(mul_1_m3_top):			C four limbs times v0 per iteration, products stored (not added) to tp
+	add	%rax, w2
+	mov	w3, (tp,j,8)
+	mov	(up,j,8), %rax
+	adc	%rdx, w1
+	xor	R32(w0), R32(w0)
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	w2, 8(tp,j,8)
+	add	%rax, w1
+	adc	%rdx, w0
+	mov	8(up,j,8), %rax
+	mov	w1, 16(tp,j,8)
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	16(up,j,8), %rax
+	adc	%rdx, w3
+L(L3):	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	24(up,j,8), %rax
+	adc	%rdx, w2
+	mov	w0, 24(tp,j,8)
+	mul	v0
+	add	$4, j
+	js	L(mul_1_m3_top)		C loop while j < 0
+
+	add	%rax, w2		C wind down: store the last three limbs
+	mov	w3, (tp)
+	adc	%rdx, w1
+	mov	w2, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+	lea	-8(up), up		C up -= 1 limb
+	jmp	L(dowhile)
+
+
+L(1m4):					C reached from jump table index 5 (n > 4, n mod 4 = 1)
+	lea	8(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C u0
+	mov	8(up), %rax		C u1
+	lea	8(up,n,8), up		C point up at end of input operand
+
+	lea	-3(n), i		C i = n - 3
+C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
+	lea	-3(n), j
+	neg	j			C j = 3 - n, loop index (negative)
+
+	mov	%rax, v1		C u1
+	mul	v0			C u0 * u1
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mov	%rax, 8(rp)		C rp[1] = low(u0 * u1)
+	jmp	L(m0)
+
+	ALIGN(16)
+L(mul_2_m0_top):			C each limb is multiplied by both v0 and v1; results stored to tp
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1			C v1 * u0
+	add	%rax, w1
+	mov	w0, -24(tp,j,8)
+	adc	%rdx, w2
+L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
+	mul	v0			C u0 * u2
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(tp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(tp,j,8)
+	adc	%rdx, w0
+L(m2x):	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	add	$4, j
+	mov	-32(up,j,8), %rax	C mov keeps the flags from add $4, j for the js below
+	mov	w3, -32(tp,j,8)
+	js	L(mul_2_m0_top)		C loop while j < 0
+
+	mul	v1			C wind down: last product by v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, -8(tp)
+	mov	w1, (tp)
+
+	lea	-16(up), up		C up -= 2 limbs
+	lea	eval(3*8-24)(tp), tp	C displacement evaluates to 0, so tp is unchanged here
+	jmp	L(dowhile_end)
+
+
+L(2m4):					C reached from jump table index 6 (n > 4, n mod 4 = 2)
+	lea	-16(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C v0 = u0
+	mov	8(up), %rax		C rax = u1
+	lea	(up,n,8), up		C point up at end of input operand
+
+	lea	-4(n), i		C i = n - 4
+C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
+	lea	-2(n), j
+	neg	j			C j = 2 - n, loop index (negative)
+
+	mul	v0			C u1 * u0
+	mov	%rax, w2
+	mov	(up,j,8), %rax
+	mov	%rdx, w1
+	jmp	L(L1)
+
+	ALIGN(16)
+L(mul_1_m1_top):			C four limbs times v0 per iteration, products stored (not added) to tp
+	add	%rax, w2
+	mov	w3, (tp,j,8)
+	mov	(up,j,8), %rax
+	adc	%rdx, w1
+L(L1):	xor	R32(w0), R32(w0)
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	w2, 8(tp,j,8)
+	add	%rax, w1
+	adc	%rdx, w0
+	mov	8(up,j,8), %rax
+	mov	w1, 16(tp,j,8)
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	16(up,j,8), %rax
+	adc	%rdx, w3
+	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	24(up,j,8), %rax
+	adc	%rdx, w2
+	mov	w0, 24(tp,j,8)
+	mul	v0
+	add	$4, j
+	js	L(mul_1_m1_top)		C loop while j < 0
+
+	add	%rax, w2		C wind down: store the last three limbs
+	mov	w3, (tp)
+	adc	%rdx, w1
+	mov	w2, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+	lea	-8(up), up		C up -= 1 limb
+	jmp	L(dowhile_mid)
+
+
+L(3m4):					C reached from jump table index 7 (n > 4, n mod 4 = 3)
+	lea	8(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C u0
+	mov	8(up), %rax		C u1
+	lea	8(up,n,8), up		C point up at end of input operand
+
+	lea	-5(n), i		C i = n - 5
+C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
+	lea	-1(n), j
+	neg	j			C j = 1 - n, loop index (negative)
+
+	mov	%rax, v1		C u1
+	mul	v0			C u0 * u1
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	%rax, 8(rp)		C rp[1] = low(u0 * u1)
+	jmp	L(m2)
+
+	ALIGN(16)
+L(mul_2_m2_top):			C each limb is multiplied by both v0 and v1; results stored to tp
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1			C v1 * u0
+	add	%rax, w1
+	mov	w0, -24(tp,j,8)
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(tp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(tp,j,8)
+	adc	%rdx, w0
+L(m2):	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	add	$4, j
+	mov	-32(up,j,8), %rax	C mov keeps the flags from add $4, j for the js below
+	mov	w3, -32(tp,j,8)
+	js	L(mul_2_m2_top)		C loop while j < 0
+
+	mul	v1			C wind down: last product by v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, -8(tp)
+	mov	w1, (tp)
+
+	lea	-16(up), up		C up -= 2 limbs
+	jmp	L(dowhile_mid)
+
+L(dowhile):				C outer loop head; each round trip runs two addmul_2 passes
+C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
+	lea	4(i), j
+	neg	j			C j = -(i + 4), loop index (negative)
+
+	mov	16(up,j,8), v0		C v0, v1 = the two multiplier limbs for this pass
+	mov	24(up,j,8), v1
+	mov	24(up,j,8), %rax
+	mul	v0			C v1 * v0
+	xor	R32(w3), R32(w3)
+	add	%rax, 24(tp,j,8)	C low limb is added straight into tp
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	jmp	L(am2)
+
+	ALIGN(16)
+L(addmul_2_m2_top):			C each limb is multiplied by both v0 and v1; results added into tp
+	add	w3, (tp,j,8)
+	adc	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1				C v1 * u0
+	add	w0, 8(tp,j,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+	mov	16(up,j,8), %rax
+	mov	$0, R32(w3)
+	mul	v0				C v0 * u1
+	add	%rax, w1
+	mov	16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1				C v1 * u1
+	add	w1, 16(tp,j,8)
+	adc	%rax, w2
+	mov	24(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	24(up,j,8), %rax
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 24(tp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+L(am2):	mov	32(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	32(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	$4, j
+	js	L(addmul_2_m2_top)	C loop while j < 0
+
+	add	w3, (tp)		C wind down: one limb added, two stored
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+
+	add	$-2, R32(i)		C i -= 2
+
+L(dowhile_mid):				C second addmul_2 pass; also the entry point from L(2m4) and L(3m4)
+C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
+	lea	2(i), j
+	neg	j			C j = -(i + 2), loop index (negative)
+
+	mov	(up,j,8), v0		C v0, v1 = the two multiplier limbs for this pass
+	mov	8(up,j,8), v1
+	mov	8(up,j,8), %rax
+	mul	v0			C v1 * v0
+	xor	R32(w1), R32(w1)
+	add	%rax, 8(tp,j,8)		C low limb is added straight into tp
+	adc	%rdx, w1
+	xor	R32(w2), R32(w2)
+	jmp	L(20)
+
+	ALIGN(16)
+L(addmul_2_m0_top):			C each limb is multiplied by both v0 and v1; results added into tp
+	add	w3, (tp,j,8)
+	adc	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1				C v1 * u0
+	add	w0, 8(tp,j,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+L(20):	mov	16(up,j,8), %rax
+	mov	$0, R32(w3)
+	mul	v0				C v0 * u1
+	add	%rax, w1
+	mov	16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1				C v1 * u1
+	add	w1, 16(tp,j,8)
+	adc	%rax, w2
+	mov	24(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	24(up,j,8), %rax
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 24(tp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	32(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	32(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	$4, j
+	js	L(addmul_2_m0_top)	C loop while j < 0
+
+	add	w3, (tp)		C wind down: one limb added, two stored
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+L(dowhile_end):				C entry point from L(1m4)
+
+	add	$-2, R32(i)		C i -= 2
+	jne	L(dowhile)		C repeat until i reaches 0
+
+C Function mpn_addmul_2s_2  (straight-line final pass over the last limbs, no loop)
+	mov	-16(up), v0		C v0, v1 = the two multiplier limbs
+	mov	-8(up), v1
+	mov	-8(up), %rax
+	mul	v0			C v1 * v0
+	xor	R32(w3), R32(w3)
+	add	%rax, -8(tp)		C low limb is added straight into tp
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	(up), %rax
+	mul	v0			C limb at (up) times v0
+	add	%rax, w3
+	mov	(up), %rax
+	adc	%rdx, w0
+	mul	v1			C limb at (up) times v1
+	add	w3, (tp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)		C the top two limbs are stored, not added
+	mov	w1, 16(tp)
+
+C Function mpn_sqr_diag_addlsh1  (doubles the limbs held in rp and adds the squares of the input limbs)
+	lea	-4(n,n), j		C j = 2n - 4
+
+	mov	8(rp), %r11		C r11 = rp[1]
+	lea	-8(up), up		C up -= 1 limb
+	lea	(rp,j,8), rp		C rp += 2n - 4 limbs
+	neg	j			C j = 4 - 2n, loop index (negative)
+	mov	(up,j,4), %rax		C scale 4: j advances two result limbs per input limb
+	mul	%rax			C square of the lowest input limb
+	test	$2, R8(j)		C pick the loop entry point from bit 1 of j
+	jnz	L(odd)
+
+L(evn):	add	%r11, %r11		C double rp[1]
+	sbb	R32(%rbx), R32(%rbx)		C save CF
+	add	%rdx, %r11		C plus the high limb of the square
+	mov	%rax, (rp,j,8)		C lowest result limb = low limb of the square
+	jmp	L(d0)
+
+L(odd):	add	%r11, %r11		C double rp[1]
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	add	%rdx, %r11		C plus the high limb of the square
+	mov	%rax, (rp,j,8)		C lowest result limb = low limb of the square
+	lea	-2(j), j		C bias j so the shared L(d1) offsets line up
+	jmp	L(d1)
+
+	ALIGN(16)
+L(top):	mov	(up,j,4), %rax		C two input limbs (four result limbs) per iteration
+	mul	%rax
+	add	R32(%rbp), R32(%rbp)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, (rp,j,8)
+L(d0):	mov	%r11, 8(rp,j,8)
+	mov	16(rp,j,8), %r10
+	adc	%r10, %r10		C double the next two limbs, with carry in
+	mov	24(rp,j,8), %r11
+	adc	%r11, %r11
+	nop
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	mov	8(up,j,4), %rax
+	mul	%rax
+	add	R32(%rbx), R32(%rbx)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, 16(rp,j,8)
+L(d1):	mov	%r11, 24(rp,j,8)
+	mov	32(rp,j,8), %r10
+	adc	%r10, %r10		C double the next two limbs, with carry in
+	mov	40(rp,j,8), %r11
+	adc	%r11, %r11
+	sbb	R32(%rbx), R32(%rbx)		C save CF
+	add	$4, j
+	js	L(top)			C loop while j < 0
+
+	mov	(up), %rax		C wind down: the last two input limbs
+	mul	%rax
+	add	R32(%rbp), R32(%rbp)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	mov	16(rp), %r10
+	adc	%r10, %r10
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	neg	R32(%rbp)		C turn the saved mask (0 or -1) into 0 or 1
+	mov	8(up), %rax
+	mul	%rax
+	add	R32(%rbx), R32(%rbx)		C restore carry
+	adc	%rax, %r10
+	adc	%rbp, %rdx		C top limb = high of the square + saved doubling carry + CF
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+
+	pop	%r14			C restore the five saved registers, lowest frame slot first
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
-- 
cgit v1.2.3