Basic constant folding implementation

author: Thomas Voss <mail@thomasvoss.com> 2024-06-21 23:36:36 +0200
committer: Thomas Voss <mail@thomasvoss.com> 2024-06-21 23:42:26 +0200
commit: a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree: b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium
parent: 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
25 files changed, 5560 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/README b/vendor/gmp-6.3.0/mpn/x86/pentium/README
new file mode 100644
index 0000000..305936b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/README
@@ -0,0 +1,181 @@
+Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+                   INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors.  The mmx subdirectory has additional code for Pentium with MMX
+(P55).
+
+
+STATUS
+
+                                cycles/limb
+
+	mpn_add_n/sub_n            2.375
+
+	mpn_mul_1                 12.0
+	mpn_add/submul_1          14.0
+
+	mpn_mul_basecase          14.2 cycles/crossproduct (approx)
+
+	mpn_sqr_basecase           8 cycles/crossproduct (approx)
+                                   or 15.5 cycles/triangleproduct (approx)
+
+	mpn_l/rshift               5.375 normal (6.0 on P54)
+				   1.875 special shift by 1 bit
+
+	mpn_divrem_1              44.0
+	mpn_mod_1                 28.0
+	mpn_divexact_by3          15.0
+
+	mpn_copyi/copyd            1.0
+
+Pentium MMX gets the following improvements
+
+	mpn_l/rshift               1.75
+
+	mpn_mul_1                 12.0 normal, 7.0 for 16-bit multiplier
+
+
+mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb.  Due to loop
+overhead and other delays (cache refill?), they run at or near 2.5
+cycles/limb.
+
+mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they
+should.  Intel documentation says a mul instruction is 10 cycles, but it
+measures 9 and the routines using it run as 9.
+
+
+
+P55 MMX AND X87
+
+The cost of switching between MMX and x87 floating point on P55 is about 100
+cycles (fld1/por/emms for instance).  In order to avoid that the two aren't
+mixed and currently that means using MMX and not x87.
+
+MMX offers a big speedup for lshift and rshift, and a nice speedup for
+16-bit multipliers in mpn_mul_1.  If fast code using x87 is found then
+perhaps the preference for MMX will be reversed.
+
+
+
+
+P54 SHLDL
+
+mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically.  The P55 runs them at the expected speed.
+
+It seems that on P54 a shldl or shrdl allows pairing in one following cycle,
+but not two.  For example, back to back repetitions of the following
+
+	shldl(	%cl, %eax, %ebx)
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+
+run at 5 cycles, as expected, but repetitions of the following run at 7
+cycles, whereas 6 would be expected (and is achieved on P55),
+
+	shldl(	%cl, %eax, %ebx)
+	xorl	%edx, %edx
+	xorl	%esi, %esi
+	xorl	%edi, %edi
+	xorl	%ebp, %ebp
+
+Three xorls run at 7 cycles too, so it doesn't seem to be just that pairing
+inhibited is only in the second following cycle (or something like that).
+
+Avoiding this problem would bring P54 shifts down from 6.0 c/l to 5.5 with a
+pattern of shift, 2 loads, shift, 2 stores, shift, etc.  A start has been
+made on something like that, but it's not yet complete.
+
+
+
+
+OTHER NOTES
+
+Prefetching Destinations
+
+    Pentium doesn't allocate cache lines on writes, unlike most other modern
+    processors.  Since the functions in the mpn class do array writes, we
+    have to handle allocating the destination cache lines by reading a word
+    from it in the loops, to achieve the best performance.
+
+Prefetching Sources
+
+    Prefetching of sources is pointless since there's no out-of-order loads.
+    Any load instruction blocks until the line is brought to L1, so it may
+    as well be the load that wants the data which blocks.
+
+Data Cache Bank Clashes
+
+    Pairing of memory operations requires that the two issued operations
+    refer to different cache banks (ie. different addresses modulo 32
+    bytes).  The simplest way to ensure this is to read/write two words from
+    the same object.  If we make operations on different objects, they might
+    or might not be to the same cache bank.
+
+PIC %eip Fetching
+
+    A simple call $+5 and popl can be used to get %eip, there's no need to
+    balance calls and returns since P5 doesn't have any return stack branch
+    prediction.
+
+Float Multiplies
+
+    fmul is pairable and can be issued every 2 cycles (with a 4 cycle
+    latency for data ready to use).  This is a lot better than integer mull
+    or imull at 9 cycles non-pairing.  Unfortunately the advantage is
+    quickly eaten away by needing to throw data through memory back to the
+    integer registers to adjust for fild and fist being signed, and to do
+    things like propagating carry bits.
+
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816.  This
+is mostly about P5, the parts about P6 aren't relevant.  Available on-line:
+
+        http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000..01ebfb9
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,203 @@
+dnl  Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 2.375 cycles/limb
+
+
+ifdef(`OPERATION_add_n',`
+	define(M4_inst,        adcl)
+	define(M4_function_n,  mpn_add_n)
+	define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+	define(M4_inst,        sbbl)
+	define(M4_function_n,  mpn_sub_n)
+	define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                          mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C                           mp_size_t size, mp_limb_t carry);
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%ebp
+	movl	PARAM_SIZE,%ecx
+
+	movl	(%ebp),%ebx
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		C zero carry flag
+	jz	L(endgo)
+
+	pushl	%edx
+FRAME_pushl()
+	movl	PARAM_CARRY,%eax
+	shrl	%eax			C shift bit 0 into carry
+	jmp	L(oop)
+
+L(endgo):
+deflit(`FRAME',16)
+	movl	PARAM_CARRY,%eax
+	shrl	%eax			C shift bit 0 into carry
+	jmp	L(end)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(M4_function_n)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC1,%esi
+	movl	PARAM_SRC2,%ebp
+	movl	PARAM_SIZE,%ecx
+
+	movl	(%ebp),%ebx
+
+	decl	%ecx
+	movl	%ecx,%edx
+	shrl	$3,%ecx
+	andl	$7,%edx
+	testl	%ecx,%ecx		C zero carry flag
+	jz	L(end)
+	pushl	%edx
+FRAME_pushl()
+
+	ALIGN(8)
+L(oop):	movl	28(%edi),%eax		C fetch destination cache line
+	leal	32(%edi),%edi
+
+L(1):	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	4(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	8(%ebp),%ebx
+	movl	%eax,-32(%edi)
+	movl	%edx,-28(%edi)
+
+L(2):	movl	8(%esi),%eax
+	movl	12(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	12(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	16(%ebp),%ebx
+	movl	%eax,-24(%edi)
+	movl	%edx,-20(%edi)
+
+L(3):	movl	16(%esi),%eax
+	movl	20(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	20(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	24(%ebp),%ebx
+	movl	%eax,-16(%edi)
+	movl	%edx,-12(%edi)
+
+L(4):	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	M4_inst	%ebx,%eax
+	movl	28(%ebp),%ebx
+	M4_inst	%ebx,%edx
+	movl	32(%ebp),%ebx
+	movl	%eax,-8(%edi)
+	movl	%edx,-4(%edi)
+
+	leal	32(%esi),%esi
+	leal	32(%ebp),%ebp
+	decl	%ecx
+	jnz	L(oop)
+
+	popl	%edx
+FRAME_popl()
+L(end):
+	decl	%edx			C test %edx w/o clobbering carry
+	js	L(end2)
+	incl	%edx
+L(oop2):
+	leal	4(%edi),%edi
+	movl	(%esi),%eax
+	M4_inst	%ebx,%eax
+	movl	4(%ebp),%ebx
+	movl	%eax,-4(%edi)
+	leal	4(%esi),%esi
+	leal	4(%ebp),%ebp
+	decl	%edx
+	jnz	L(oop2)
+L(end2):
+	movl	(%esi),%eax
+	M4_inst	%ebx,%eax
+	movl	%eax,(%edi)
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000..d83cc45
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,144 @@
+dnl  Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+ifdef(`OPERATION_addmul_1', `
+      define(M4_inst,        addl)
+      define(M4_function_1,  mpn_addmul_1)
+      define(M4_function_1c, mpn_addmul_1c)
+
+',`ifdef(`OPERATION_submul_1', `
+      define(M4_inst,        subl)
+      define(M4_function_1,  mpn_submul_1)
+      define(M4_function_1c, mpn_submul_1c)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                          mp_limb_t mult, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+
+	ALIGN(8)
+PROLOGUE(M4_function_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+	pushl	%esi		FRAME_pushl()
+
+L(start_1c):
+	movl	PARAM_SRC, %esi
+	movl	PARAM_SIZE, %eax
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	leal	-1(%eax), %ebx		C size-1
+
+	leal	(%esi,%eax,4), %esi
+	xorl	$-1, %ebx		C -size, and clear carry
+
+	leal	(%edi,%eax,4), %edi
+
+L(top):
+	C eax
+	C ebx	counter, negative
+	C ecx	carry
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp
+
+	adcl	$0, %ecx
+	movl	(%esi,%ebx,4), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%ecx, %eax
+	movl	(%edi,%ebx,4), %ecx
+
+	adcl	$0, %edx
+	M4_inst	%eax, %ecx
+
+	movl	%ecx, (%edi,%ebx,4)
+	incl	%ebx
+
+	movl	%edx, %ecx
+	jnz	L(top)
+
+
+	adcl	$0, %ecx
+	popl	%ebx
+
+	movl	%ecx, %eax
+	popl	%edi
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm
new file mode 100644
index 0000000..c2c4f58
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm
@@ -0,0 +1,266 @@
+dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
+
+dnl  Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C P54:  24.5  30.5   cycles/limb
+C P55:  23.0  28.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected.  On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 slowdown for both odd and even.
+
+defframe(PARAM_SHIFT,  24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	$-1, %ecx
+	movl	PARAM_DIVISOR, %eax
+
+L(strip_twos):
+	ASSERT(nz, `orl %eax, %eax')
+	shrl	%eax
+	incl	%ecx			C shift count
+
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %edx	C d
+	andl	$127, %eax		C d/2, 7 bits
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %eax
+',`
+	call	L(here)
+L(here):
+	popl	%ebp			C eip
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+	C AGI
+	movl	binvert_limb_table@GOT(%ebp), %ebp
+	C AGI
+	movzbl	(%eax,%ebp), %eax
+')
+',`
+
+dnl non-PIC
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	movl	%eax, %ebp		C inv
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	PARAM_SIZE, %ebx
+
+	movl	%eax, %ebp
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	%edx, PARAM_DIVISOR	C d without twos
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)
+EPILOGUE()
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SHIFT, %ecx
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebx
+	movl	PARAM_INVERSE, %eax
+
+L(common):
+	pushl	%esi		FRAME_pushl()
+	push	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+	movl	%eax, VAR_INVERSE
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	leal	(%edi,%ebx,4), %edi	C dst end
+
+	negl	%ebx			C -size
+
+	xorl	%ebp, %ebp		C initial carry bit
+
+	orl	%ecx, %ecx		C shift
+	movl	(%esi,%ebx,4), %eax	C src low limb
+	jz	L(odd_entry)
+
+	xorl	%edx, %edx		C initial carry limb (for even, if one)
+	incl	%ebx
+	jz	L(one)
+
+	movl	(%esi,%ebx,4), %edx	C src second limb (for even)
+	shrdl(	%cl, %edx, %eax)
+
+	jmp	L(even_entry)
+
+
+	ALIGN(8)
+L(odd_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ebp, %edx
+
+	subl	%edx, %eax
+
+	sbbl	%ebp, %ebp
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	twos
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	subl	%ebp, %edx		C carry bit
+	movl	-4(%esi,%ebx,4), %eax	C src limb
+
+	movl	(%esi,%ebx,4), %ebp	C and one above it
+
+	shrdl(	%cl, %ebp, %eax)
+
+	subl	%edx, %eax		C carry limb
+
+	sbbl	%ebp, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+
+	jnz	L(even_top)
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax		C src high limb
+	subl	%ebp, %edx
+
+L(one):
+	shrl	%cl, %eax
+
+	subl	%edx, %eax		C no carry if division is exact
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)		C dst high limb
+	nop				C protect against cache bank clash
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm
new file mode 100644
index 0000000..b080545
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm
@@ -0,0 +1,181 @@
+dnl  Intel Pentium mpn_com -- mpn ones complement.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb
+
+
+NAILS_SUPPORT(0-31)
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This code is similar to mpn_copyi, basically there's just some "xorl
+C $GMP_NUMB_MASK"s inserted.
+C
+C Alternatives:
+C
+C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
+C are the same alignment mod 8, but it doesn't seem worth the trouble for
+C just that case (there'd need to be some plain integer available too for
+C the unaligned case).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	leal	(%eax,%ecx,4), %eax
+	xorl	$-1, %ecx		C -size-1
+
+	movl	PARAM_DST, %edx
+	addl	$8, %ecx		C -size+7
+
+	jns	L(end)
+
+	movl	(%edx), %esi		C fetch destination cache line
+	nop
+
+L(top):
+	C eax	&src[size]
+	C ebx
+	C ecx	counter, limbs, negative
+	C edx	dst, incrementing
+	C esi	scratch
+	C edi	scratch
+	C ebp
+
+	movl	28(%edx), %esi		C destination prefetch
+	addl	$32, %edx
+
+	movl	-28(%eax,%ecx,4), %esi
+	movl	-24(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -32(%edx)
+	movl	%edi, -28(%edx)
+
+	movl	-20(%eax,%ecx,4), %esi
+	movl	-16(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -24(%edx)
+	movl	%edi, -20(%edx)
+
+	movl	-12(%eax,%ecx,4), %esi
+	movl	-8(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -16(%edx)
+	movl	%edi, -12(%edx)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, -8(%edx)
+	movl	%edi, -4(%edx)
+
+	addl	$8, %ecx
+	js	L(top)
+
+
+L(end):
+	C eax	&src[size]
+	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
+	C edx	dst, next location to store
+
+	subl	$4, %ecx
+	nop
+
+	jns	L(no4)
+
+	movl	-12(%eax,%ecx,4), %esi
+	movl	-8(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, (%edx)
+	movl	%edi, 4(%edx)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, 8(%edx)
+	movl	%edi, 12(%edx)
+
+	addl	$16, %edx
+	addl	$4, %ecx
+L(no4):
+
+	subl	$2, %ecx
+	nop
+
+	jns	L(no2)
+
+	movl	-4(%eax,%ecx,4), %esi
+	movl	(%eax,%ecx,4), %edi
+	xorl	$GMP_NUMB_MASK, %esi
+	xorl	$GMP_NUMB_MASK, %edi
+	movl	%esi, (%edx)
+	movl	%edi, 4(%edx)
+
+	addl	$8, %edx
+	addl	$2, %ecx
+L(no2):
+
+	popl	%edi
+	jnz	L(done)
+
+	movl	-4(%eax), %ecx
+
+	xorl	$GMP_NUMB_MASK, %ecx
+	popl	%esi
+
+	movl	%ecx, (%edx)
+	ret
+
+L(done):
+	popl	%esi
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm
new file mode 100644
index 0000000..72a543b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm
@@ -0,0 +1,146 @@
+dnl  Intel Pentium mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C See comments in copyi.asm.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+	movl	PARAM_SRC, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	leal	-4(%eax,%ecx,4), %eax		C &src[size-1]
+	movl	PARAM_DST, %edx
+
+	subl	$7, %ecx			C size-7
+	jle	L(end)
+
+	movl	28-4(%edx,%ecx,4), %esi		C prefetch cache, dst[size-1]
+	nop
+
+L(top):
+	C eax	src, decrementing
+	C ebx
+	C ecx	counter, limbs
+	C edx	dst
+	C esi	scratch
+	C edi	scratch
+	C ebp
+
+	movl	28-32(%edx,%ecx,4), %esi	C prefetch dst cache line
+	subl	$8, %ecx
+
+	movl	(%eax), %esi			C read words pairwise
+	movl	-4(%eax), %edi
+	movl	%esi, 56(%edx,%ecx,4)		C store words pairwise
+	movl	%edi, 52(%edx,%ecx,4)
+
+	movl	-8(%eax), %esi
+	movl	-12(%eax), %edi
+	movl	%esi, 48(%edx,%ecx,4)
+	movl	%edi, 44(%edx,%ecx,4)
+
+	movl	-16(%eax), %esi
+	movl	-20(%eax), %edi
+	movl	%esi, 40(%edx,%ecx,4)
+	movl	%edi, 36(%edx,%ecx,4)
+
+	movl	-24(%eax), %esi
+	movl	-28(%eax), %edi
+	movl	%esi, 32(%edx,%ecx,4)
+	movl	%edi, 28(%edx,%ecx,4)
+
+	leal	-32(%eax), %eax
+	jg	L(top)
+
+
+L(end):
+	C ecx	-7 to 0, representing respectively 0 to 7 limbs remaining
+	C eax	src end
+	C edx	dst, next location to store
+
+	addl	$4, %ecx
+	jle	L(no4)
+
+	movl	(%eax), %esi
+	movl	-4(%eax), %edi
+	movl	%esi, 8(%edx,%ecx,4)
+	movl	%edi, 4(%edx,%ecx,4)
+
+	movl	-8(%eax), %esi
+	movl	-12(%eax), %edi
+	movl	%esi, (%edx,%ecx,4)
+	movl	%edi, -4(%edx,%ecx,4)
+
+	subl	$16, %eax
+	subl	$4, %ecx
+L(no4):
+
+	addl	$2, %ecx
+	jle	L(no2)
+
+	movl	(%eax), %esi
+	movl	-4(%eax), %edi
+	movl	%esi, (%edx,%ecx,4)
+	movl	%edi, -4(%edx,%ecx,4)
+
+	subl	$8, %eax
+	subl	$2, %ecx
+L(no2):
+
+	jnz	L(done)
+
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)	C risk of cache bank clash here
+
+L(done):
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm
new file mode 100644
index 0000000..d983d6b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm
@@ -0,0 +1,164 @@
+dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Destination prefetching is done to avoid repeated write-throughs on lines
+C not already in L1.
+C
+C At least one of the src or dst pointer needs to be incremented rather than
+C using indexing, so that there's somewhere to put the loop control without
+C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
+C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
+C subtracts in the finishup code.
+C
+C The block of finishup code is almost as big as the main loop itself, which
+C is unfortunate, but it's faster that way than with say rep movsl, by about
+C 10 cycles for instance on P55.
+C
+C There's nothing to be gained from MMX on P55, since it can do only one
+C movq load (or store) per cycle, so the throughput would be the same as the
+C code here (and even then only if src and dst have the same alignment mod
+C 8).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_DST, %edx
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	leal	(%edx,%ecx,4), %edx	C &dst[size-1]
+	xorl	$-1, %ecx		C -size-1
+
+	movl	PARAM_SRC, %esi
+	addl	$8, %ecx		C -size+7
+
+	jns	L(end)
+
+	movl	-28(%edx,%ecx,4), %eax	C fetch destination cache line, dst[0]
+	nop
+
+L(top):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, limbs, negative
+	C edx	&dst[size-1]
+	C esi	src, incrementing
+	C edi
+	C ebp
+
+	movl	(%edx,%ecx,4), %eax	C fetch destination cache line
+	addl	$8, %ecx
+
+	movl	(%esi), %eax		C read words pairwise
+	movl	4(%esi), %ebx
+	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
+	movl	%ebx, -56(%edx,%ecx,4)
+
+	movl	8(%esi), %eax
+	movl	12(%esi), %ebx
+	movl	%eax, -52(%edx,%ecx,4)
+	movl	%ebx, -48(%edx,%ecx,4)
+
+	movl	16(%esi), %eax
+	movl	20(%esi), %ebx
+	movl	%eax, -44(%edx,%ecx,4)
+	movl	%ebx, -40(%edx,%ecx,4)
+
+	movl	24(%esi), %eax
+	movl	28(%esi), %ebx
+	movl	%eax, -36(%edx,%ecx,4)
+	movl	%ebx, -32(%edx,%ecx,4)
+
+	leal	32(%esi), %esi
+	js	L(top)
+
+
+L(end):
+	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
+	C esi	src end
+	C edx	dst, next location to store
+
+	subl	$4, %ecx
+	jns	L(no4)
+
+	movl	(%esi), %eax
+	movl	4(%esi), %ebx
+	movl	%eax, -12(%edx,%ecx,4)
+	movl	%ebx, -8(%edx,%ecx,4)
+
+	movl	8(%esi), %eax
+	movl	12(%esi), %ebx
+	movl	%eax, -4(%edx,%ecx,4)
+	movl	%ebx, (%edx,%ecx,4)
+
+	addl	$16, %esi
+	addl	$4, %ecx
+L(no4):
+
+	subl	$2, %ecx
+	jns	L(no2)
+
+	movl	(%esi), %eax
+	movl	4(%esi), %ebx
+	movl	%eax, -4(%edx,%ecx,4)
+	movl	%ebx, (%edx,%ecx,4)
+
+	addl	$8, %esi
+	addl	$2, %ecx
+L(no2):
+
+	jnz	L(done)
+
+	movl	(%esi), %eax
+	movl	%eax, -4(%edx,%ecx,4)	C risk of cache bank clash here
+
+L(done):
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm
new file mode 100644
index 0000000..21b5287
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm
@@ -0,0 +1,264 @@
+dnl  Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         divisor
+C       odd   even
+C P54:  24.5  30.5   cycles/limb
+C P55:  23.0  28.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t divisor);
+C
+C Plain divl is used for small sizes, since the inverse takes a while to
+C setup.  Multiplying works out faster for size>=3 when the divisor is odd,
+C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
+C size==3 for even are about the same speed for both divl or mul, but the
+C former is used since it will use up less code cache.
+C
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected.  On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 slowdown for both odd and even.
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE,   12)
+defframe(PARAM_SRC,    8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+	TEXT
+
+	ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	movl	PARAM_SIZE, %ecx
+
+	pushl	%esi		FRAME_pushl()
+	push	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	andl	$1, %eax
+
+	movl	PARAM_DST, %edi
+	addl	%ecx, %eax	C size if even, size+1 if odd
+
+	cmpl	$4, %eax
+	jae	L(mul_by_inverse)
+
+
+	xorl	%edx, %edx
+L(div_top):
+	movl	-4(%esi,%ecx,4), %eax
+
+	divl	PARAM_DIVISOR
+
+	movl	%eax, -4(%edi,%ecx,4)
+	decl	%ecx
+
+	jnz	L(div_top)
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+
+L(mul_by_inverse):
+	movl	PARAM_DIVISOR, %eax
+	movl	$-1, %ecx
+
+L(strip_twos):
+	ASSERT(nz, `orl %eax, %eax')
+	shrl	%eax
+	incl	%ecx			C shift count
+
+	jnc	L(strip_twos)
+
+	leal	1(%eax,%eax), %edx	C d
+	andl	$127, %eax		C d/2, 7 bits
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+ifdef(`PIC',`dnl
+	LEA(	binvert_limb_table, %ebp)
+	movzbl	(%eax,%ebp), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	movl	%eax, %ebp		C inv
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	PARAM_SIZE, %ebx
+
+	movl	%eax, %ebp
+	addl	%eax, %eax		C 2*inv
+
+	imull	%ebp, %ebp		C inv*inv
+
+	imull	%edx, %ebp		C inv*inv*d
+
+	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
+	movl	%edx, PARAM_DIVISOR	C d without twos
+
+	leal	(%esi,%ebx,4), %esi	C src end
+	leal	(%edi,%ebx,4), %edi	C dst end
+
+	negl	%ebx			C -size
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	movl	%eax, VAR_INVERSE
+	xorl	%ebp, %ebp		C initial carry bit
+
+	movl	(%esi,%ebx,4), %eax	C src low limb
+	orl	%ecx, %ecx		C shift
+
+	movl	4(%esi,%ebx,4), %edx	C src second limb (for even)
+	jz	L(odd_entry)
+
+	shrdl(	%cl, %edx, %eax)
+
+	incl	%ebx
+	jmp	L(even_entry)
+
+
+	ALIGN(8)
+L(odd_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi,%ebx,4), %eax
+	subl	%ebp, %edx
+
+	subl	%edx, %eax
+
+	sbbl	%ebp, %ebp
+
+L(odd_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, (%edi,%ebx,4)
+
+	incl	%ebx
+	jnz	L(odd_top)
+
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+L(even_top):
+	C eax	scratch
+	C ebx	counter, limbs, negative
+	C ecx	twos
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	carry bit, 0 or -1
+
+	mull	PARAM_DIVISOR
+
+	subl	%ebp, %edx		C carry bit
+	movl	-4(%esi,%ebx,4), %eax	C src limb
+
+	movl	(%esi,%ebx,4), %ebp	C and one above it
+
+	shrdl(	%cl, %ebp, %eax)
+
+	subl	%edx, %eax		C carry limb
+
+	sbbl	%ebp, %ebp
+
+L(even_entry):
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi,%ebx,4)
+	incl	%ebx
+
+	jnz	L(even_top)
+
+
+
+	mull	PARAM_DIVISOR
+
+	movl	-4(%esi), %eax		C src high limb
+	subl	%ebp, %edx
+
+	shrl	%cl, %eax
+
+	subl	%edx, %eax		C no carry if division is exact
+
+	imull	VAR_INVERSE, %eax
+
+	movl	%eax, -4(%edi)		C dst high limb
+	nop				C protect against cache bank clash
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000..befa6e2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,76 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 166MHz P54 */
+
+/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD             16
+#define MUL_TOOM33_THRESHOLD             90
+
+#define SQR_BASECASE_THRESHOLD            0  /* always */
+#define SQR_TOOM2_THRESHOLD              22
+#define SQR_TOOM3_THRESHOLD             122
+
+#define DIV_SB_PREINV_THRESHOLD       MP_SIZE_T_MAX  /* never */
+#define DIV_DC_THRESHOLD                 52
+#define POWM_THRESHOLD                   77
+
+#define HGCD_THRESHOLD                  121
+#define GCD_ACCEL_THRESHOLD               3
+#define GCD_DC_THRESHOLD                615
+#define JACOBI_BASE_METHOD                2
+
+#define USE_PREINV_DIVREM_1               0
+#define USE_PREINV_MOD_1                  1  /* native */
+#define DIVREM_2_THRESHOLD            MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD              0  /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD          0  /* always (native) */
+
+#define GET_STR_DC_THRESHOLD             23
+#define GET_STR_PRECOMPUTE_THRESHOLD     33
+#define SET_STR_THRESHOLD              2788
+
+#define MUL_FFT_TABLE  { 432, 928, 1664, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD          448
+#define MUL_FFT_THRESHOLD              3328
+
+#define SQR_FFT_TABLE  { 496, 928, 1920, 4608, 10240, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD          512
+#define SQR_FFT_THRESHOLD              3328
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm
new file mode 100644
index 0000000..6c6c1a1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm
@@ -0,0 +1,154 @@
+dnl  Intel P5 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
+C
+C It might be possible to shave 1 cycle from the loop, and hence 2
+C cycles/limb.  The xorb is taking 2 cycles, but a separate load and xor
+C would be 1, if the right schedule could be found (not found so far).
+C Wanting to avoid potential cache bank clashes makes it tricky.
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: referencing popcount.asm's table is incorrect as it hurt incremental
+C linking.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC1, 4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_hamdist)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi	FRAME_pushl()
+
+	shll	%ecx		C size in byte pairs
+	pushl	%edi	FRAME_pushl()
+
+ifdef(`PIC',`
+	pushl	%ebx	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+ifdef(`DARWIN',`
+	movl	PARAM_SRC1, %esi
+	movl	PARAM_SRC2, %edi
+	LEA(	TABLE_NAME, %ebp)
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+	xorl	%eax, %eax	C total
+',`
+	call	L(here)	FRAME_pushl()
+L(here):
+	movl	PARAM_SRC1, %esi
+	popl	%ebp	FRAME_popl()
+
+	movl	PARAM_SRC2, %edi
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+
+	movl	TABLE_NAME@GOT(%ebp), %ebp
+	xorl	%eax, %eax	C total
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+	movl	PARAM_SRC1, %esi
+	movl	PARAM_SRC2, %edi
+
+	xorl	%eax, %eax	C total
+	pushl	%ebx	FRAME_pushl()
+
+	xorl	%edx, %edx	C byte
+	xorl	%ebx, %ebx	C byte
+
+define(TABLE,`TABLE_NAME($1)')
+')
+
+
+	C The nop after the xorb seems necessary.  Although a movb might be
+	C expected to go down the V pipe in the second cycle of the xorb, it
+	C doesn't and costs an extra 2 cycles.
+L(top):
+	C eax	total
+	C ebx	byte
+	C ecx	counter, 2*size to 2
+	C edx	byte
+	C esi	src1
+	C edi	src2
+	C ebp	[PIC] table
+
+	addl	%ebx, %eax
+	movb	-1(%esi,%ecx,2), %bl
+
+	addl	%edx, %eax
+	movb	-1(%edi,%ecx,2), %dl
+
+	xorb	%dl, %bl
+	movb	-2(%esi,%ecx,2), %dl
+
+	xorb	-2(%edi,%ecx,2), %dl
+	nop
+
+	movb	TABLE(%ebx), %bl
+	decl	%ecx
+
+	movb	TABLE(%edx), %dl
+	jnz	L(top)
+
+
+ifdef(`PIC',`
+	popl	%ebp
+')
+	addl	%ebx, %eax
+	popl	%ebx
+
+	addl	%edx, %eax
+	popl	%edi
+
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm
new file mode 100644
index 0000000..1877317
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm
@@ -0,0 +1,176 @@
+dnl  Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 3.0 c/l  and, ior, xor
+C     3.5 c/l  andn, iorn, nand, nior, xnor
+
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4op',        `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n,     , andl,    )
+M4_choose_op( andn_n,    , andl, yes)
+M4_choose_op( nand_n, yes, andl,    )
+M4_choose_op( ior_n,     ,  orl,    )
+M4_choose_op( iorn_n,    ,  orl, yes)
+M4_choose_op( nior_n, yes,  orl,    )
+M4_choose_op( xor_n,     , xorl,    )
+M4_choose_op( xnor_n, yes, xorl,    )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+NAILS_SUPPORT(0-31)
+
+
+C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C Nothing complicated here, just some care to avoid data cache bank clashes
+C and AGIs.
+C
+C We're one register short of being able to do a simple 4 loads, 2 ops, 2
+C stores.  Instead %ebp is juggled a bit and nops are introduced to keep the
+C pairings as intended.  An in-place operation would free up a register, for
+C an 0.5 c/l speedup, if that's worth bothering with.
+C
+C This code seems best for P55 too.  Data alignment is a big problem for MMX
+C and the pairing restrictions on movq and integer instructions make life
+C difficult.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_YP,  12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	pushl	%edi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_XP, %ebx
+
+	movl	PARAM_YP, %esi
+	movl	PARAM_WP, %edi
+
+	shrl	%ecx
+	jnc	L(entry)
+
+	movl	(%ebx,%ecx,8), %eax	C risk of data cache bank clash here
+	movl	(%esi,%ecx,8), %edx
+
+M4pre(`	notl_or_xorl_GMP_NUMB_MASK(%edx)')
+
+	M4op	%edx, %eax
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+	orl	%ecx, %ecx
+
+	movl	%eax, (%edi,%ecx,8)
+	jz	L(done)
+
+	jmp	L(entry)
+
+
+L(top):
+	C eax
+	C ebx	xp
+	C ecx	counter, limb pairs, decrementing
+	C edx
+	C esi	yp
+	C edi	wp
+	C ebp
+
+	M4op	%ebp, %edx
+	nop
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+M4post(`xorl	$GMP_NUMB_MASK, %edx')
+
+	movl	%eax, 4(%edi,%ecx,8)
+	movl	%edx, (%edi,%ecx,8)
+
+L(entry):
+	movl	-4(%ebx,%ecx,8), %ebp
+	nop
+
+	movl	-4(%esi,%ecx,8), %eax
+	movl	-8(%esi,%ecx,8), %edx
+
+M4pre(`	xorl	$GMP_NUMB_MASK, %eax')
+M4pre(`	xorl	$GMP_NUMB_MASK, %edx')
+
+	M4op	%ebp, %eax
+	movl	-8(%ebx,%ecx,8), %ebp
+
+	decl	%ecx
+	jnz	L(top)
+
+
+	M4op	%ebp, %edx
+	nop
+
+M4post(`xorl	$GMP_NUMB_MASK, %eax')
+M4post(`xorl	$GMP_NUMB_MASK, %edx')
+
+	movl	%eax, 4(%edi,%ecx,8)
+	movl	%edx, (%edi,%ecx,8)
+
+
+L(done):
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000..2a31f36
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,243 @@
+dnl  Intel Pentium mpn_lshift -- mpn left shift.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C P5,P54:    6.0
+C P55:       5.375
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ebp
+	movl	PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%esi),%eax
+	cmpl	%edi,%eax
+	jnc	L(special)		C jump if s_ptr + 1 >= res_ptr
+	leal	(%esi,%ebp,4),%eax
+	cmpl	%eax,%edi
+	jnc	L(special)		C jump if res_ptr >= s_ptr + size
+
+L(normal):
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+	xorl	%eax,%eax
+	shldl(	%cl, %edx, %eax)	C compute carry limb
+	pushl	%eax			C push carry limb onto stack
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	L(end)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl	-28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl	%ebx,(%edi)
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	shldl(	%cl, %ebx, %edx)
+	shldl(	%cl, %eax, %ebx)
+	movl	%edx,-8(%edi)
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	shldl(	%cl, %edx, %eax)
+	shldl(	%cl, %ebx, %edx)
+	movl	%eax,-16(%edi)
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	shldl(	%cl, %eax, %ebx)
+	shldl(	%cl, %edx, %eax)
+	movl	%ebx,-24(%edi)
+	movl	%eax,-28(%edi)
+
+	subl	$32,%esi
+	subl	$32,%edi
+	decl	%ebp
+	jnz	L(oop)
+
+L(end):	popl	%ebp
+	andl	$7,%ebp
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shldl(	%cl,%eax,%edx)
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	subl	$4,%esi
+	subl	$4,%edi
+	decl	%ebp
+	jnz	L(oop2)
+
+L(end2):
+	shll	%cl,%edx		C compute least significant limb
+	movl	%edx,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissable if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+	movl	(%esi),%edx
+	addl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	addl	%edx,%edx
+	incl	%ebp
+	decl	%ebp
+	jz	L(Lend)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(Loop):
+	movl	28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,(%edi)
+	adcl	%edx,%edx
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	adcl	%ebx,%ebx
+	movl	%edx,8(%edi)
+	adcl	%eax,%eax
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	adcl	%edx,%edx
+	movl	%eax,16(%edi)
+	adcl	%ebx,%ebx
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	adcl	%eax,%eax
+	movl	%ebx,24(%edi)
+	adcl	%edx,%edx
+	movl	%eax,28(%edi)
+
+	leal	32(%esi),%esi		C use leal not to clobber carry
+	leal	32(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebp
+	sbbl	%eax,%eax		C save carry in %eax
+	andl	$7,%ebp
+	jz	L(Lend2)
+	addl	%eax,%eax		C restore carry from eax
+L(Loop2):
+	movl	%edx,%ebx
+	movl	(%esi),%edx
+	adcl	%edx,%edx
+	movl	%ebx,(%edi)
+
+	leal	4(%esi),%esi		C use leal not to clobber carry
+	leal	4(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		C restore carry from eax
+L(L1):	movl	%edx,(%edi)		C store last limb
+
+	sbbl	%eax,%eax
+	negl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000..02a0def
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,163 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 233MHz P55 */
+
+#define MOD_1_NORM_THRESHOLD                 5
+#define MOD_1_UNNORM_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD      MP_SIZE_T_MAX  /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD         12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD         0
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        11
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD     63
+#define USE_PREINV_DIVREM_1                  0
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           51
+
+#define MUL_TOOM22_THRESHOLD                16
+#define MUL_TOOM33_THRESHOLD                53
+#define MUL_TOOM44_THRESHOLD               128
+#define MUL_TOOM6H_THRESHOLD               189
+#define MUL_TOOM8H_THRESHOLD               260
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD      89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD      91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD      90
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD      88
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 20
+#define SQR_TOOM3_THRESHOLD                 73
+#define SQR_TOOM4_THRESHOLD                178
+#define SQR_TOOM6_THRESHOLD                210
+#define SQR_TOOM8_THRESHOLD                375
+
+#define MULMOD_BNM1_THRESHOLD               11
+#define SQRMOD_BNM1_THRESHOLD               12
+
+#define MUL_FFT_MODF_THRESHOLD             364  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    364, 5}, {     15, 6}, {      8, 5}, {     17, 6}, \
+    {      9, 5}, {     19, 6}, {     17, 7}, {      9, 6}, \
+    {     21, 7}, {     11, 6}, {     23, 7}, {     15, 6}, \
+    {     31, 7}, {     21, 8}, {     11, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     39, 8}, \
+    {     23, 7}, {     47, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     47,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     79,10}, {     47, 9}, \
+    {     95,11}, {     31,10}, {     63, 9}, {    135,10}, \
+    {     79, 9}, {    159, 8}, {    319, 9}, {    167,10}, \
+    {     95, 9}, {    191, 8}, {    383,11}, {     63,10}, \
+    {    127, 9}, {    255,10}, {    143, 9}, {    287,10}, \
+    {    159, 9}, {    319,11}, {     95,10}, {    191, 9}, \
+    {    383,12}, {     63,11}, {    127,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    351,11}, \
+    {    191,10}, {    415,11}, {    223,12}, {    127,11}, \
+    {    255,10}, {    511,11}, {    287,10}, {    575,11}, \
+    {    351,12}, {    191,11}, {    415,13}, {    127,12}, \
+    {    255,11}, {    575,12}, {    319,11}, {    703,12}, \
+    {    383,11}, {    831,12}, {    447,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 90
+#define MUL_FFT_THRESHOLD                 3520
+
+#define SQR_FFT_MODF_THRESHOLD             340  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    340, 5}, {     17, 6}, {      9, 5}, {     19, 6}, \
+    {     17, 7}, {      9, 6}, {     21, 7}, {     11, 6}, \
+    {     23, 7}, {     15, 6}, {     31, 7}, {     21, 8}, \
+    {     11, 7}, {     29, 8}, {     15, 7}, {     33, 8}, \
+    {     19, 7}, {     39, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     65, 8}, {     43, 9}, \
+    {     23, 8}, {     47,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     83, 9}, {     47, 8}, \
+    {     95,10}, {     31, 9}, {     63, 8}, {    127, 9}, \
+    {     79,10}, {     47, 9}, {     95,11}, {     31,10}, \
+    {     63, 9}, {    127, 8}, {    255, 9}, {    135,10}, \
+    {     79, 9}, {    159, 8}, {    319,10}, {     95, 9}, \
+    {    191,11}, {     63,10}, {    127, 9}, {    255, 8}, \
+    {    511, 9}, {    271,10}, {    143, 9}, {    287, 8}, \
+    {    575, 9}, {    303,10}, {    159, 9}, {    319,11}, \
+    {     95,10}, {    191, 9}, {    383,10}, {    207,12}, \
+    {     63,11}, {    127,10}, {    271, 9}, {    543,10}, \
+    {    287, 9}, {    575,10}, {    303,11}, {    159,10}, \
+    {    351,11}, {    191,10}, {    415,11}, {    223,10}, \
+    {    447,12}, {    127,11}, {    255,10}, {    543,11}, \
+    {    287,10}, {    607,11}, {    351,12}, {    191,11}, \
+    {    479,13}, {    127,12}, {    255,11}, {    575,12}, \
+    {    319,11}, {    703,12}, {    383,11}, {    767,12}, \
+    {    447,13}, {   8192,14}, {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 96
+#define SQR_FFT_THRESHOLD                 5504
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  48
+#define MULLO_MUL_N_THRESHOLD             6633
+
+#define DC_DIV_QR_THRESHOLD                 43
+#define DC_DIVAPPR_Q_THRESHOLD             170
+#define DC_BDIV_QR_THRESHOLD                43
+#define DC_BDIV_Q_THRESHOLD                110
+
+#define INV_MULMOD_BNM1_THRESHOLD           30
+#define INV_NEWTON_THRESHOLD               177
+#define INV_APPR_THRESHOLD                 171
+
+#define BINV_NEWTON_THRESHOLD              194
+#define REDC_1_TO_REDC_N_THRESHOLD          50
+
+#define MU_DIV_QR_THRESHOLD               1142
+#define MU_DIVAPPR_Q_THRESHOLD            1142
+#define MUPI_DIV_QR_THRESHOLD               90
+#define MU_BDIV_QR_THRESHOLD               942
+#define MU_BDIV_Q_THRESHOLD               1017
+
+#define MATRIX22_STRASSEN_THRESHOLD         13
+#define HGCD_THRESHOLD                      92
+#define GCD_DC_THRESHOLD                   283
+#define GCDEXT_DC_THRESHOLD                221
+#define JACOBI_BASE_METHOD                   2
+
+#define GET_STR_DC_THRESHOLD                18
+#define GET_STR_PRECOMPUTE_THRESHOLD        31
+#define SET_STR_DC_THRESHOLD               490
+#define SET_STR_PRECOMPUTE_THRESHOLD       994
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm
new file mode 100644
index 0000000..72e3196
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm
@@ -0,0 +1,40 @@
+dnl  Intel P55 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P55: hamdist 12.0 cycles/limb
+
+C For reference, this code runs at 11.5 cycles/limb for popcount, which is
+C slower than the plain integer mpn/x86/pentium/popcount.asm.
+
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000..04b0ddc
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,463 @@
+dnl  Intel P5 mpn_lshift -- mpn left shift.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right.  Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  minimum 5, because the unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+	pushl	%ebx
+	pushl	%edi
+deflit(`FRAME',8)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edx
+
+	movl	PARAM_SRC, %ebx
+	movl	PARAM_SHIFT, %ecx
+
+	cmp	$UNROLL_THRESHOLD, %eax
+	jae	L(unroll)
+
+	movl	-4(%ebx,%eax,4), %edi	C src high limb
+	decl	%eax
+
+	jnz	L(simple)
+
+	shldl(	%cl, %edi, %eax)	C eax was decremented to zero
+
+	shll	%cl, %edi
+
+	movl	%edi, (%edx)		C dst low limb
+	popl	%edi			C risk of data cache bank clash
+
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx,%eax,4), %mm5	C src high limb
+
+	movd	%ecx, %mm6		C lshift
+	negl	%ecx
+
+	psllq	%mm6, %mm5
+	addl	$32, %ecx
+
+	movd	%ecx, %mm7
+	psrlq	$32, %mm5		C retval
+
+
+L(simple_top):
+	C eax	counter, limbs, negative
+	C ebx	src
+	C ecx
+	C edx	dst
+	C esi
+	C edi
+	C
+	C mm0	scratch
+	C mm5	return value
+	C mm6	shift
+	C mm7	32-shift
+
+	movq	-4(%ebx,%eax,4), %mm0
+	decl	%eax
+
+	psrlq	%mm7, %mm0
+
+	C
+
+	movd	%mm0, 4(%edx,%eax,4)
+	jnz	L(simple_top)
+
+
+	movd	(%ebx), %mm0
+
+	movd	%mm5, %eax
+	psllq	%mm6, %mm0
+
+	popl	%edi
+	popl	%ebx
+
+	movd	%mm0, (%edx)
+
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(unroll):
+	C eax	size
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	-4(%ebx,%eax,4), %mm5	C src high limb
+	leal	(%ebx,%eax,4), %edi
+
+	movd	%ecx, %mm6		C lshift
+	andl	$4, %edi
+
+	psllq	%mm6, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process high limb separately (marked xxx) to
+	C make it so.
+	C
+	C  source     -8(ebx,%eax,4)
+	C                  |
+	C  +-------+-------+-------+--
+	C  |               |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+	C
+	C  dest
+	C     -4(edx,%eax,4)
+	C          |
+	C  +-------+-------+--
+	C  |  xxx  |       |
+	C  +-------+-------+--
+
+	movq	-8(%ebx,%eax,4), %mm0	C unaligned load
+
+	psllq	%mm6, %mm0
+	decl	%eax
+
+	psrlq	$32, %mm0
+
+	C
+
+	movd	%mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+	movq	-8(%ebx,%eax,4), %mm1	C src high qword
+	leal	(%edx,%eax,4), %edi
+
+	andl	$4, %edi
+	psrlq	$32, %mm5		C return value
+
+	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
+	jz	L(start_dst_aligned)
+
+	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+	C is 32 bits extra.  High limb of dst (marked xxx) handled here
+	C separately.
+	C
+	C  source     -8(ebx,%eax,4)
+	C                  |
+	C  +-------+-------+--
+	C  |      mm1      |
+	C  +-------+-------+--
+	C                0mod8   4mod8
+	C
+	C  dest
+	C     -4(edx,%eax,4)
+	C          |
+	C  +-------+-------+-------+--
+	C  |  xxx  |               |
+	C  +-------+-------+-------+--
+	C        0mod8   4mod8   0mod8
+
+	movq	%mm1, %mm0
+	addl	$32, %ecx		C new shift
+
+	psllq	%mm6, %mm0
+
+	movd	%ecx, %mm6
+	psrlq	$32, %mm0
+
+	C wasted cycle here waiting for %mm0
+
+	movd	%mm0, -4(%edx,%eax,4)
+	subl	$4, %edx
+L(start_dst_aligned):
+
+
+	psllq	%mm6, %mm1
+	negl	%ecx			C -shift
+
+	addl	$64, %ecx		C 64-shift
+	movq	%mm3, %mm2
+
+	movd	%ecx, %mm7
+	subl	$8, %eax		C size-8
+
+	psrlq	%mm7, %mm3
+
+	por	%mm1, %mm3		C mm3 ready to store
+	jc	L(finish)
+
+
+	C The comments in mpn_rshift apply here too.
+
+	ALIGN(8)
+L(unroll_loop):
+	C eax	counter, limbs
+	C ebx	src
+	C ecx
+	C edx	dst
+	C esi
+	C edi
+	C
+	C mm0
+	C mm1
+	C mm2	src qword from 16(%ebx,%eax,4)
+	C mm3	dst qword ready to store to 24(%edx,%eax,4)
+	C
+	C mm5	return value
+	C mm6	lshift
+	C mm7	rshift
+
+	movq	8(%ebx,%eax,4), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	movq	%mm3, 24(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	(%ebx,%eax,4), %mm3	C
+	psllq	%mm6, %mm1		C
+
+	movq	%mm0, 16(%edx,%eax,4)
+	movq	%mm3, %mm2		C
+
+	psrlq	%mm7, %mm3		C
+	subl	$4, %eax
+
+	por	%mm1, %mm3		C
+	jnc	L(unroll_loop)
+
+
+
+L(finish):
+	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining
+
+	testb	$2, %al
+
+	jz	L(finish_no_two)
+
+	movq	8(%ebx,%eax,4), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	movq	%mm3, 24(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	%mm1, %mm2
+	movq	%mm0, %mm3
+
+	subl	$2, %eax
+L(finish_no_two):
+
+
+	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
+	C
+	C mm2	src prev qword, from 16(%ebx,%eax,4)
+	C mm3	dst qword, for 24(%edx,%eax,4)
+
+	testb	$1, %al
+	movd	%mm5, %eax	C retval
+
+	popl	%edi
+	jz	L(finish_zero)
+
+
+	C One extra src limb, destination was aligned.
+	C
+	C                 source                  ebx
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C dest         edx+12           edx+4     edx
+	C --+---------------+---------------+-------+
+	C   |      mm3      |               |       |
+	C --+---------------+---------------+-------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C One extra src limb, destination was unaligned.
+	C
+	C                 source                  ebx
+	C                 --+---------------+-------+
+	C                   |      mm2      |       |
+	C                 --+---------------+-------+
+	C
+	C         dest         edx+12           edx+4
+	C         --+---------------+---------------+
+	C           |      mm3      |               |
+	C         --+---------------+---------------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at 4(%edx), and in the aligned case
+	C there's an extra limb of dst to be formed from that extra src limb
+	C left shifted.
+
+
+	movd	(%ebx), %mm0
+	psllq	%mm6, %mm2
+
+	movq	%mm3, 12(%edx)
+	psllq	$32, %mm0
+
+	movq	%mm0, %mm1
+	psrlq	%mm7, %mm0
+
+	por	%mm2, %mm0
+	psllq	%mm6, %mm1
+
+	movq	%mm0, 4(%edx)
+	psrlq	$32, %mm1
+
+	andl	$32, %ecx
+	popl	%ebx
+
+	jz	L(finish_one_unaligned)
+
+	movd	%mm1, (%edx)
+L(finish_one_unaligned):
+
+	emms
+
+	ret
+
+
+L(finish_zero):
+
+	C No extra src limbs, destination was aligned.
+	C
+	C                 source          ebx
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C dest          edx+8             edx
+	C --+---------------+---------------+
+	C   |      mm3      |               |
+	C --+---------------+---------------+
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C No extra src limbs, destination was unaligned.
+	C
+	C               source            ebx
+	C                 --+---------------+
+	C                   |      mm2      |
+	C                 --+---------------+
+	C
+	C         dest          edx+8   edx+4
+	C         --+---------------+-------+
+	C           |      mm3      |       |
+	C         --+---------------+-------+
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C The movd for the unaligned case writes the same data to 4(%edx)
+	C that the movq does for the aligned case.
+
+
+	movq	%mm3, 8(%edx)
+	andl	$32, %ecx
+
+	psllq	%mm6, %mm2
+	jz	L(finish_zero_unaligned)
+
+	movq	%mm2, (%edx)
+L(finish_zero_unaligned):
+
+	psrlq	$32, %mm2
+	popl	%ebx
+
+	movd	%mm5, %eax	C retval
+
+	movd	%mm2, 4(%edx)
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
new file mode 100644
index 0000000..4ced577
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
@@ -0,0 +1,371 @@
+dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    cycles/limb
+C P5:   12.0   for 32-bit multiplier
+C        7.0   for 16-bit multiplier
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C
+C When the multiplier is 16 bits some special case MMX code is used.  Small
+C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
+C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
+C size==8 end up being quite close.  If src isn't aligned to an 8 byte
+C boundary then one limb is processed separately with roughly a 5 cycle
+C penalty, so in that case it's say size==8 and size==9 which are close.
+C
+C Alternatives:
+C
+C MMX is not believed to be of any use for 32-bit multipliers, since for
+C instance the current method would just have to be more or less duplicated
+C for the high and low halves of the multiplier, and would probably
+C therefore run at about 14 cycles, which is slower than the plain integer
+C at 12.
+C
+C Adding the high and low MMX products using integer code seems best.  An
+C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
+C any joy.  Perhaps something could be done keeping the values signed and
+C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
+C perhaps not.
+C
+C Future:
+C
+C An mpn_mul_1c entrypoint would need a double carry out of the low result
+C limb in the 16-bit code, unless it could be assumed the carry fits in 16
+C bits, possibly as carry<multiplier, this being true of a big calculation
+C done piece by piece.  But let's worry about that if/when mul_1c is
+C actually used.
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	cmpl	$1, %ecx
+	jne	L(two_or_more)
+
+	C one limb only
+
+	movl	PARAM_MULTIPLIER, %eax
+	movl	PARAM_DST, %ecx
+
+	mull	(%edx)
+
+	movl	%eax, (%ecx)
+	movl	%edx, %eax
+
+	ret
+
+
+L(two_or_more):
+	C eax	size
+	C ebx
+	C ecx	carry
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%esi		FRAME_pushl()
+	pushl	%edi		FRAME_pushl()
+
+	movl	%edx, %esi		C src
+	movl	PARAM_DST, %edi
+
+	movl	PARAM_MULTIPLIER, %eax
+	pushl	%ebx		FRAME_pushl()
+
+	leal	(%esi,%ecx,4), %esi	C src end
+	leal	(%edi,%ecx,4), %edi	C dst end
+
+	negl	%ecx			C -size
+
+	pushl	%ebp		FRAME_pushl()
+	cmpl	$65536, %eax
+
+	jb	L(small)
+
+
+L(big):
+	xorl	%ebx, %ebx		C carry limb
+	sarl	%ecx			C -size/2
+
+	jnc	L(top)			C with carry flag clear
+
+
+	C size was odd, process one limb separately
+
+	mull	4(%esi,%ecx,8)		C m * src[0]
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	orl	%edx, %ebx		C carry limb, and clear carry flag
+
+
+L(top):
+	C eax
+	C ebx	carry
+	C ecx	counter, negative
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	(scratch carry)
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%edx, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %ebp
+	movl	4(%esi,%ecx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%ebx, (%edi,%ecx,8)
+	addl	%ebp, %eax
+
+	movl	%eax, 4(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(top)
+
+
+	adcl	$0, %ebx
+	popl	%ebp
+
+	movl	%ebx, %eax
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+
+L(small):
+	C Special case for 16-bit multiplier.
+	C
+	C eax	multiplier
+	C ebx
+	C ecx	-size
+	C edx	src
+	C esi	src end
+	C edi	dst end
+	C ebp	multiplier
+
+	C size<3 not supported here.  At size==3 we're already a couple of
+	C cycles faster, so there's no threshold as such, just use the MMX
+	C as soon as possible.
+
+	cmpl	$-3, %ecx
+	ja	L(big)
+
+	movd	%eax, %mm7		C m
+	pxor	%mm6, %mm6		C initial carry word
+
+	punpcklwd %mm7, %mm7		C m replicated 2 times
+	addl	$2, %ecx		C -size+2
+
+	punpckldq %mm7, %mm7		C m replicated 4 times
+	andl	$4, %edx		C test alignment, clear carry flag
+
+	movq	%mm7, %mm0		C m
+	jz	L(small_entry)
+
+
+	C Source is unaligned, process one limb separately.
+	C
+	C Plain integer code is used here, since it's smaller and is about
+	C the same 13 cycles as an mmx block would be.
+	C
+	C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
+	C the use of separate incl and orl.
+
+	mull	-8(%esi,%ecx,4)		C m * src[0]
+
+	movl	%eax, -8(%edi,%ecx,4)	C dst[0]
+	incl	%ecx			C one limb processed
+
+	movd	%edx, %mm6		C initial carry
+
+	orl	%eax, %eax		C clear carry flag
+	jmp	L(small_entry)
+
+
+C The scheduling here is quite tricky, since so many instructions have
+C pairing restrictions.  In particular the js won't pair with a movd, and
+C can't be paired with an adc since it wants flags from the inc, so
+C instructions are rotated to the top of the loop to find somewhere useful
+C for it.
+C
+C Trouble has been taken to avoid overlapping successive loop iterations,
+C since that would greatly increase the size of the startup and finishup
+C code.  Actually there's probably not much advantage to be had from
+C overlapping anyway, since the difficulties are mostly with pairing, not
+C with latencies as such.
+C
+C In the comments x represents the src data and m the multiplier (16
+C bits, but replicated 4 times).
+C
+C The m signs calculated in %mm3 are a loop invariant and could be held in
+C say %mm5, but that would save only one instruction and hence be no faster.
+
+L(small_top):
+	C eax	l.low, then l.high
+	C ebx	(h.low)
+	C ecx	counter, -size+2 to 0 or 1
+	C edx	(h.high)
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+	C
+	C %mm0	(high products)
+	C %mm1	(low products)
+	C %mm2	(adjust for m using x signs)
+	C %mm3	(adjust for x using m signs)
+	C %mm4
+	C %mm5
+	C %mm6	h.low, then carry
+	C %mm7	m replicated 4 times
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	movd	%mm0, %edx		C h.high
+	movq	%mm0, %mm6		C new c
+
+	adcl	%eax, %ebx
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.high
+	movq	%mm7, %mm0
+
+	adcl	%eax, %edx
+	movl	%ebx, -16(%edi,%ecx,4)
+
+	movl	%edx, -12(%edi,%ecx,4)
+	psrlq	$32, %mm6		C c
+
+L(small_entry):
+	pmulhw	-8(%esi,%ecx,4), %mm0	C h = (x*m).high
+	movq	%mm7, %mm1
+
+	pmullw	-8(%esi,%ecx,4), %mm1	C l = (x*m).low
+	movq	%mm7, %mm3
+
+	movq	-8(%esi,%ecx,4), %mm2	C x
+	psraw	$15, %mm3		C m signs
+
+	pand	-8(%esi,%ecx,4), %mm3	C x selected by m signs
+	psraw	$15, %mm2		C x signs
+
+	paddw	%mm3, %mm0		C add x to h if m neg
+	pand	%mm7, %mm2		C m selected by x signs
+
+	paddw	%mm2, %mm0		C add m to h if x neg
+	incl	%ecx
+
+	movd	%mm1, %eax		C l.low
+	punpcklwd %mm0, %mm6		C c + h.low << 16
+
+	psrlq	$16, %mm0		C h.high
+	js	L(small_top)
+
+
+
+
+	movd	%mm6, %ebx		C h.low
+	psrlq	$32, %mm1		C l.high
+
+	adcl	%eax, %ebx
+	popl	%ebp		FRAME_popl()
+
+	movd	%mm0, %edx		C h.high
+	psrlq	$32, %mm0		C l.high
+
+	movd	%mm1, %eax		C l.high
+
+	adcl	%eax, %edx
+	movl	%ebx, -12(%edi,%ecx,4)
+
+	movd	%mm0, %eax		C c
+
+	adcl	$0, %eax
+	movl	%edx, -8(%edi,%ecx,4)
+
+	orl	%ecx, %ecx
+	jnz	L(small_done)		C final %ecx==1 means even, ==0 odd
+
+
+	C Size odd, one extra limb to process.
+	C Plain integer code is used here, since it's smaller and is about
+	C the same speed as another mmx block would be.
+
+	movl	%eax, %ecx
+	movl	PARAM_MULTIPLIER, %eax
+
+	mull	-4(%esi)
+
+	addl	%ecx, %eax
+
+	adcl	$0, %edx
+	movl	%eax, -4(%edi)
+
+	movl	%edx, %eax
+L(small_done):
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000..e3b274b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,468 @@
+dnl  Intel P5 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left.  Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned.  Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+	pushl	%ebx
+	pushl	%edi
+deflit(`FRAME',8)
+
+	movl	PARAM_SIZE, %eax
+	movl	PARAM_DST, %edx
+
+	movl	PARAM_SRC, %ebx
+	movl	PARAM_SHIFT, %ecx
+
+	cmp	$UNROLL_THRESHOLD, %eax
+	jae	L(unroll)
+
+	decl	%eax
+	movl	(%ebx), %edi		C src low limb
+
+	jnz	L(simple)
+
+	shrdl(	%cl, %edi, %eax)	C eax was decremented to zero
+
+	shrl	%cl, %edi
+
+	movl	%edi, (%edx)		C dst low limb
+	popl	%edi			C risk of data cache bank clash
+
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(simple):
+	C eax	size-1
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx), %mm5		C src[0]
+	leal	(%ebx,%eax,4), %ebx	C &src[size-1]
+
+	movd	%ecx, %mm6		C rshift
+	leal	-4(%edx,%eax,4), %edx	C &dst[size-2]
+
+	psllq	$32, %mm5
+	negl	%eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop.  Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+	C eax	counter, limbs, negative
+	C ebx	&src[size-1]
+	C ecx	return value
+	C edx	&dst[size-2]
+	C
+	C mm0	scratch
+	C mm5	return value
+	C mm6	shift
+
+	movq	(%ebx,%eax,4), %mm0
+	incl	%eax
+
+	psrlq	%mm6, %mm0
+
+	movd	%mm0, (%edx,%eax,4)
+	jnz	L(simple_top)
+
+
+	movd	(%ebx), %mm0
+	psrlq	%mm6, %mm5		C return value
+
+	psrlq	%mm6, %mm0
+	popl	%edi
+
+	movd	%mm5, %eax
+	popl	%ebx
+
+	movd	%mm0, 4(%edx)
+
+	emms
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(unroll):
+	C eax	size
+	C ebx	src
+	C ecx	shift
+	C edx	dst
+	C esi
+	C edi
+	C ebp
+deflit(`FRAME',8)
+
+	movd	(%ebx), %mm5		C src[0]
+	movl	$4, %edi
+
+	movd	%ecx, %mm6		C rshift
+	testl	%edi, %ebx
+
+	psllq	$32, %mm5
+	jz	L(start_src_aligned)
+
+
+	C src isn't aligned, process low limb separately (marked xxx) and
+	C step src and dst by one limb, making src aligned.
+	C
+	C source                  ebx
+	C --+-------+-------+-------+
+	C           |          xxx  |
+	C --+-------+-------+-------+
+	C         4mod8   0mod8   4mod8
+	C
+	C         dest            edx
+	C         --+-------+-------+
+	C           |       |  xxx  |
+	C         --+-------+-------+
+
+	movq	(%ebx), %mm0		C unaligned load
+
+	psrlq	%mm6, %mm0
+	addl	$4, %ebx
+
+	decl	%eax
+
+	movd	%mm0, (%edx)
+	addl	$4, %edx
+L(start_src_aligned):
+
+
+	movq	(%ebx), %mm1
+	testl	%edi, %edx
+
+	psrlq	%mm6, %mm5		C retval
+	jz	L(start_dst_aligned)
+
+	C dst isn't aligned, add 4 to make it so, and pretend the shift is
+	C 32 bits extra.  Low limb of dst (marked xxx) handled here
+	C separately.
+	C
+	C          source          ebx
+	C          --+-------+-------+
+	C            |      mm1      |
+	C          --+-------+-------+
+	C                  4mod8   0mod8
+	C
+	C  dest                    edx
+	C  --+-------+-------+-------+
+	C                    |  xxx  |
+	C  --+-------+-------+-------+
+	C          4mod8   0mod8   4mod8
+
+	movq	%mm1, %mm0
+	addl	$32, %ecx		C new shift
+
+	psrlq	%mm6, %mm0
+
+	movd	%ecx, %mm6
+
+	movd	%mm0, (%edx)
+	addl	$4, %edx
+L(start_dst_aligned):
+
+
+	movq	8(%ebx), %mm3
+	negl	%ecx
+
+	movq	%mm3, %mm2		C mm2 src qword
+	addl	$64, %ecx
+
+	movd	%ecx, %mm7
+	psrlq	%mm6, %mm1
+
+	leal	-12(%ebx,%eax,4), %ebx
+	leal	-20(%edx,%eax,4), %edx
+
+	psllq	%mm7, %mm3
+	subl	$7, %eax		C size-7
+
+	por	%mm1, %mm3		C mm3 ready to store
+	negl	%eax			C -(size-7)
+
+	jns	L(finish)
+
+
+	C This loop is the important bit, the rest is just support.  Careful
+	C instruction scheduling achieves the claimed 1.75 c/l.  The
+	C relevant parts of the pairing rules are:
+	C
+	C - mmx loads and stores execute only in the U pipe
+	C - only one mmx shift in a pair
+	C - wait one cycle before storing an mmx register result
+	C - the usual address generation interlock
+	C
+	C Two qword calculations are slightly interleaved.  The instructions
+	C marked "C" belong to the second qword, and the "C prev" one is for
+	C the second qword from the previous iteration.
+
+	ALIGN(8)
+L(unroll_loop):
+	C eax	counter, limbs, negative
+	C ebx	&src[size-12]
+	C ecx
+	C edx	&dst[size-12]
+	C esi
+	C edi
+	C
+	C mm0
+	C mm1
+	C mm2	src qword from -8(%ebx,%eax,4)
+	C mm3	dst qword ready to store to -8(%edx,%eax,4)
+	C
+	C mm5	return value
+	C mm6	rshift
+	C mm7	lshift
+
+	movq	(%ebx,%eax,4), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, -8(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	8(%ebx,%eax,4), %mm3	C
+	psrlq	%mm6, %mm1		C
+
+	movq	%mm0, (%edx,%eax,4)
+	movq	%mm3, %mm2		C
+
+	psllq	%mm7, %mm3		C
+	addl	$4, %eax
+
+	por	%mm1, %mm3		C
+	js	L(unroll_loop)
+
+
+L(finish):
+	C eax	0 to 3 representing respectively 3 to 0 limbs remaining
+
+	testb	$2, %al
+
+	jnz	L(finish_no_two)
+
+	movq	(%ebx,%eax,4), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, -8(%edx,%eax,4)	C prev
+	por	%mm2, %mm0
+
+	movq	%mm1, %mm2
+	movq	%mm0, %mm3
+
+	addl	$2, %eax
+L(finish_no_two):
+
+
+	C eax	2 or 3 representing respectively 1 or 0 limbs remaining
+	C
+	C mm2	src prev qword, from -8(%ebx,%eax,4)
+	C mm3	dst qword, for -8(%edx,%eax,4)
+
+	testb	$1, %al
+	popl	%edi
+
+	movd	%mm5, %eax	C retval
+	jnz	L(finish_zero)
+
+
+	C One extra limb, destination was aligned.
+	C
+	C source                ebx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                                  edx
+	C +-------+---------------+---------------+--
+	C |       |               |      mm3      |
+	C +-------+---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C One extra limb, destination was unaligned.
+	C
+	C source                ebx
+	C +-------+---------------+--
+	C |       |      mm2      |
+	C +-------+---------------+--
+	C
+	C dest                          edx
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = ecx = 64-(shift+32)
+
+
+	C In both cases there's one extra limb of src to fetch and combine
+	C with mm2 to make a qword at 8(%edx), and in the aligned case
+	C there's a further extra limb of dst to be formed.
+
+
+	movd	8(%ebx), %mm0
+	psrlq	%mm6, %mm2
+
+	movq	%mm0, %mm1
+	psllq	%mm7, %mm0
+
+	movq	%mm3, (%edx)
+	por	%mm2, %mm0
+
+	psrlq	%mm6, %mm1
+	andl	$32, %ecx
+
+	popl	%ebx
+	jz	L(finish_one_unaligned)
+
+	C dst was aligned, must store one extra limb
+	movd	%mm1, 16(%edx)
+L(finish_one_unaligned):
+
+	movq	%mm0, 8(%edx)
+
+	emms
+
+	ret
+
+
+L(finish_zero):
+
+	C No extra limbs, destination was aligned.
+	C
+	C source        ebx
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest                        edx+4
+	C +---------------+---------------+--
+	C |               |      mm3      |
+	C +---------------+---------------+--
+	C
+	C mm6 = shift
+	C mm7 = ecx = 64-shift
+
+
+	C No extra limbs, destination was unaligned.
+	C
+	C source        ebx
+	C +---------------+--
+	C |      mm2      |
+	C +---------------+--
+	C
+	C dest                edx+4
+	C +-------+---------------+--
+	C |       |      mm3      |
+	C +-------+---------------+--
+	C
+	C mm6 = shift+32
+	C mm7 = 64-(shift+32)
+
+
+	C The movd for the unaligned case is clearly the same data as the
+	C movq for the aligned case, it's just a choice between whether one
+	C or two limbs should be written.
+
+
+	movq	%mm3, 4(%edx)
+	psrlq	%mm6, %mm2
+
+	movd	%mm2, 12(%edx)
+	andl	$32, %ecx
+
+	popl	%ebx
+	jz	L(finish_zero_unaligned)
+
+	movq	%mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+	emms
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm
new file mode 100644
index 0000000..2d88223
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm
@@ -0,0 +1,192 @@
+dnl  Intel P5 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl  Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC,  4)
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	movl	PARAM_SRC, %edx
+
+	subl	$2, %ecx
+	ja	L(three_or_more)
+
+	movl	(%edx), %eax
+	jne	L(one)
+
+
+	movl	4(%edx), %ecx
+	movl	%eax, %edx
+
+	shrl	$24, %edx
+	andl	$0xFFFFFF, %eax
+
+	addl	%edx, %eax
+	movl	%ecx, %edx
+
+	shrl	$16, %ecx
+	andl	$0xFFFF, %edx
+
+	shll	$8, %edx
+	addl	%ecx, %eax
+
+	addl	%edx, %eax
+
+L(one):
+	ret
+
+
+L(three_or_more):
+	C eax
+	C ebx
+	C ecx	size-2
+	C edx	src
+	C esi
+	C edi
+	C ebp
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	pushl	%edi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	xorl	%esi, %esi		C 0mod3
+	xorl	%edi, %edi		C 1mod3
+
+	xorl	%ebp, %ebp		C 2mod3, and clear carry
+
+L(top):
+	C eax	scratch
+	C ebx	scratch
+	C ecx	counter, limbs
+	C edx	src
+	C esi	0mod3
+	C edi	1mod3
+	C ebp	2mod3
+
+	movl	(%edx), %eax
+	movl	4(%edx), %ebx
+
+	adcl	%eax, %esi
+	movl	8(%edx), %eax
+
+	adcl	%ebx, %edi
+	leal	12(%edx), %edx
+
+	adcl	%eax, %ebp
+	leal	-2(%ecx), %ecx
+
+	decl	%ecx
+	jg	L(top)
+
+
+	C ecx is -2, -1 or 0, representing 0, 1 or 2 more limbs, respectively
+
+	movl	$0xFFFFFFFF, %ebx	C mask
+	incl	%ecx
+
+	js	L(combine)		C 0 more
+
+	movl	(%edx), %eax
+	movl	$0xFFFFFF00, %ebx
+
+	adcl	%eax, %esi
+	decl	%ecx
+
+	js	L(combine)		C 1 more
+
+	movl	4(%edx), %eax
+	movl	$0xFFFF0000, %ebx
+
+	adcl	%eax, %edi
+
+
+
+L(combine):
+	C eax
+	C ebx	mask
+	C ecx
+	C edx
+	C esi	0mod3
+	C edi	1mod3
+	C ebp	2mod3
+
+	sbbl	%ecx, %ecx		C carry
+	movl	%esi, %eax		C 0mod3
+
+	andl	%ebx, %ecx		C masked for position
+	andl	$0xFFFFFF, %eax		C 0mod3 low
+
+	shrl	$24, %esi		C 0mod3 high
+	subl	%ecx, %eax		C apply carry
+
+	addl	%esi, %eax		C apply 0mod3
+	movl	%edi, %ebx		C 1mod3
+
+	shrl	$16, %edi		C 1mod3 high
+	andl	$0x0000FFFF, %ebx
+
+	shll	$8, %ebx		C 1mod3 low
+	addl	%edi, %eax		C apply 1mod3 high
+
+	addl	%ebx, %eax		C apply 1mod3 low
+	movl	%ebp, %ebx		C 2mod3
+
+	shrl	$8, %ebp		C 2mod3 high
+	andl	$0xFF, %ebx
+
+	shll	$16, %ebx		C 2mod3 low
+	addl	%ebp, %eax		C apply 2mod3 high
+
+	addl	%ebx, %eax		C apply 2mod3 low
+
+	popl	%ebp
+	popl	%edi
+
+	popl	%esi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm
new file mode 100644
index 0000000..a90abca
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm
@@ -0,0 +1,279 @@
+dnl  Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
+
+dnl  Copyright 2000-2002, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 23.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C                               mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t carry);
+C
+C There seems no way to pair up the two lone instructions in the main loop.
+C
+C The special case for size==1 saves about 20 cycles (non-PIC), making it
+C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
+C all sizes.
+C
+C Alternatives:
+C
+C Using mmx for the multiplies might be possible, with pmullw and pmulhw
+C having just 3 cycle latencies, but carry bit handling would probably be
+C complicated.
+
+defframe(PARAM_CARRY,  16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE,   8)
+defframe(PARAM_SRC,    4)
+
+dnl  re-using parameter space
+define(VAR_INVERSE,`PARAM_SIZE')
+
+	TEXT
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	movl	PARAM_CARRY, %edx
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax
+	xorl	%edx, %edx		C carry
+
+L(start_1c):
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+	shrl	%eax			C d/2
+	LEA(	binvert_limb_table, %ecx)
+	pushl	%ebx		FRAME_pushl()
+	movl	PARAM_SIZE, %ebx
+
+	andl	$127, %eax
+	subl	$2, %ebx
+
+	movb	(%eax,%ecx), %cl
+	jc	L(one_limb)
+',`
+	call	L(here)		FRAME_pushl()
+L(here):
+
+	shrl	%eax			C d/2
+	movl	(%esp), %ecx		C eip
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
+	movl	%ebx, (%esp)		C push ebx
+
+	andl	$127, %eax
+	movl	PARAM_SIZE, %ebx
+
+	movl	binvert_limb_table@GOT(%ecx), %ecx
+	subl	$2, %ebx
+
+	movb	(%eax,%ecx), %cl			C inv 8 bits
+	jc	L(one_limb)
+')
+',`
+dnl non-PIC
+	shrl	%eax			C d/2
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_SIZE, %ebx
+	andl	$127, %eax
+
+	subl	$2, %ebx
+	jc	L(one_limb)
+
+	movb	binvert_limb_table(%eax), %cl		C inv 8 bits
+')
+
+	movl	%ecx, %eax
+	addl	%ecx, %ecx		C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
+
+	movl	%ecx, %eax
+	addl	%ecx, %ecx		C 2*inv
+
+	imull	%eax, %eax		C inv*inv
+
+	imull	PARAM_DIVISOR, %eax	C inv*inv*d
+
+	subl	%eax, %ecx		C inv = 2*inv - inv*inv*d
+	pushl	%esi		FRAME_pushl()
+
+	ASSERT(e,`	C d*inv == 1 mod 2^GMP_LIMB_BITS
+	movl	%ecx, %eax
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax')
+
+	movl	PARAM_SRC, %esi
+	movl	%ecx, VAR_INVERSE
+
+	movl	(%esi), %eax		C src[0]
+	leal	4(%esi,%ebx,4), %esi	C &src[size-1]
+
+	xorl	$-1, %ebx		C -(size-1)
+	ASSERT(nz)
+	jmp	L(entry)
+
+
+C The use of VAR_INVERSE means only a store is needed for that value, rather
+C than a push and pop of say %edi.
+
+	ALIGN(16)
+L(top):
+	C eax	scratch, low product
+	C ebx	counter, limbs, negative
+	C ecx	carry bit
+	C edx	scratch, high product
+	C esi	&src[size-1]
+	C edi
+	C ebp
+
+	mull	PARAM_DIVISOR		C h:dummy = q*d
+
+	movl	(%esi,%ebx,4), %eax	C src[i]
+	subl	%ecx, %edx		C h -= -c
+
+L(entry):
+	subl	%edx, %eax		C s = src[i] - h
+
+	sbbl	%ecx, %ecx		C new -c (0 or -1)
+
+	imull	VAR_INVERSE, %eax	C q = s*i
+
+	incl	%ebx
+	jnz	L(top)
+
+
+	mull	PARAM_DIVISOR
+
+	movl	(%esi), %eax		C src high
+	subl	%ecx, %edx		C h -= -c
+
+	cmpl	PARAM_DIVISOR, %eax
+
+	jbe	L(skip_last)
+deflit(FRAME_LAST,FRAME)
+
+
+	subl	%edx, %eax		C s = src[i] - h
+	popl	%esi		FRAME_popl()
+
+	sbbl	%ecx, %ecx		C c (0 or -1)
+	popl	%ebx		FRAME_popl()
+
+	imull	VAR_INVERSE, %eax	C q = s*i
+
+	mull	PARAM_DIVISOR		C h:dummy = q*d
+
+	movl	%edx, %eax
+
+	subl	%ecx, %eax
+
+	ret
+
+
+C When high<divisor can skip last step.
+
+L(skip_last):
+deflit(`FRAME',FRAME_LAST)
+	C eax	src high
+	C ebx
+	C ecx
+	C edx	r
+	C esi
+
+	subl	%eax, %edx	C r-s
+	popl	%esi		FRAME_popl()
+
+	sbbl	%eax, %eax	C -1 if underflow
+	movl	PARAM_DIVISOR, %ebx
+
+	andl	%ebx, %eax	C divisor if underflow
+	popl	%ebx		FRAME_popl()
+
+	addl	%edx, %eax	C addback if underflow
+
+	ret
+
+
+C Special case for size==1 using a division for r = c-a mod d.
+C Could look for a-c<d and save a division sometimes, but that doesn't seem
+C worth bothering about.
+
+L(one_limb):
+deflit(`FRAME',4)
+	C eax
+	C ebx	size-2 (==-1)
+	C ecx
+	C edx	carry
+	C esi	src end
+	C edi
+	C ebp
+
+	movl	%edx, %eax
+	movl	PARAM_SRC, %edx
+
+	movl	PARAM_DIVISOR, %ecx
+	popl	%ebx		FRAME_popl()
+
+	subl	(%edx), %eax		C c-a
+
+	sbbl	%edx, %edx
+	decl	%ecx			C d-1
+
+	andl	%ecx, %edx		C b*d+c-a if c<a, or c-a if c>=a
+
+	divl	PARAM_DIVISOR
+
+	movl	%edx, %eax
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000..a0858af
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,177 @@
+dnl  Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 12.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       mp_limb_t multiplier, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY,     20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE,      12)
+defframe(PARAM_SRC,       8)
+defframe(PARAM_DST,       4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+
+	movl	PARAM_CARRY, %ecx
+	pushl	%esi		FRAME_pushl()
+
+	jmp	L(start_1c)
+
+EPILOGUE()
+
+
+	ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+	xorl	%ecx, %ecx
+	pushl	%esi		FRAME_pushl()
+
+L(start_1c):
+	movl	PARAM_SRC, %esi
+	movl	PARAM_SIZE, %eax
+
+	shrl	%eax
+	jnz	L(two_or_more)
+
+
+	C one limb only
+
+	movl	(%esi), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%eax, %ecx
+	movl	PARAM_DST, %eax
+
+	adcl	$0, %edx
+	popl	%esi
+
+	movl	%ecx, (%eax)
+	movl	%edx, %eax
+
+	ret
+
+
+L(two_or_more):
+	C eax	size/2
+	C ebx
+	C ecx	carry
+	C edx
+	C esi	src
+	C edi
+	C ebp
+
+	pushl	%edi		FRAME_pushl()
+	pushl	%ebx		FRAME_pushl()
+
+	movl	PARAM_DST, %edi
+	leal	-1(%eax), %ebx		C size/2-1
+
+	notl	%ebx			C -size, preserve carry
+
+	leal	(%esi,%eax,8), %esi	C src end
+	leal	(%edi,%eax,8), %edi	C dst end
+
+	pushl	%ebp		FRAME_pushl()
+	jnc	L(top)
+
+
+	C size was odd, process one limb separately
+
+	movl	(%esi,%ebx,8), %eax
+	addl	$4, %esi
+
+	mull	PARAM_MULTIPLIER
+
+	addl	%ecx, %eax
+	movl	%edx, %ecx
+
+	movl	%eax, (%edi,%ebx,8)
+	leal	4(%edi), %edi
+
+
+L(top):
+	C eax
+	C ebx	counter, negative
+	C ecx	carry
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp
+
+	adcl	$0, %ecx
+	movl	(%esi,%ebx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%edx, %ebp
+	addl	%eax, %ecx
+
+	adcl	$0, %ebp
+	movl	4(%esi,%ebx,8), %eax
+
+	mull	PARAM_MULTIPLIER
+
+	movl	%ecx, (%edi,%ebx,8)
+	addl	%ebp, %eax
+
+	movl	%eax, 4(%edi,%ebx,8)
+	incl	%ebx
+
+	movl	%edx, %ecx
+	jnz	L(top)
+
+
+	adcl	$0, %ecx
+	popl	%ebp
+
+	movl	%ecx, %eax
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm
new file mode 100644
index 0000000..4c7beb5
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm
@@ -0,0 +1,150 @@
+dnl  Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 24.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_2 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_srcptr mult);
+C
+C At 24 c/l this is only 2 cycles faster than a separate mul_1 and addmul_1,
+C but has the advantage of making just one pass over the operands.
+C
+C There's not enough registers to use PARAM_MULT directly, so the multiplier
+C limbs are transferred to local variables on the stack.
+
+defframe(PARAM_MULT, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,   8)
+defframe(PARAM_DST,   4)
+
+dnl  re-use parameter space
+define(VAR_MULT_LOW, `PARAM_SRC')
+define(VAR_MULT_HIGH,`PARAM_DST')
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_2)
+deflit(`FRAME',0)
+
+	pushl	%esi		FRAME_pushl()
+	pushl	%edi		FRAME_pushl()
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	movl	PARAM_MULT, %eax
+	movl	PARAM_SIZE, %ecx
+
+	movl	4(%eax), %edx		C mult high
+	movl	(%eax), %eax		C mult low
+
+	movl	%eax, VAR_MULT_LOW
+	movl	%edx, VAR_MULT_HIGH
+
+	pushl	%ebx		FRAME_pushl()
+	pushl	%ebp		FRAME_pushl()
+
+	mull	(%esi)			C src[0] * mult[0]
+
+	movl	%eax, %ebp		C in case src==dst
+	movl	(%esi), %eax		C src[0]
+
+	movl	%ebp, (%edi)		C dst[0]
+	movl	%edx, %ebx		C initial low carry
+
+	xorl	%ebp, %ebp		C initial high carry
+	leal	(%edi,%ecx,4), %edi	C dst end
+
+	mull	VAR_MULT_HIGH		C src[0] * mult[1]
+
+	subl	$2, %ecx		C size-2
+	js	L(done)
+
+	leal	8(%esi,%ecx,4), %esi	C &src[size]
+	xorl	$-1, %ecx		C -(size-1)
+
+
+
+L(top):
+	C eax	low prod
+	C ebx	low carry
+	C ecx	counter, negative
+	C edx	high prod
+	C esi	src end
+	C edi	dst end
+	C ebp	high carry (0 or -1)
+
+	andl	$1, %ebp		C 1 or 0
+	addl	%eax, %ebx
+
+	adcl	%edx, %ebp
+	ASSERT(nc)
+	movl	(%esi,%ecx,4), %eax
+
+	mull	VAR_MULT_LOW
+
+	addl	%eax, %ebx		C low carry
+	movl	(%esi,%ecx,4), %eax
+
+	adcl	%ebp, %edx		C high carry
+	movl	%ebx, (%edi,%ecx,4)
+
+	sbbl	%ebp, %ebp		C new high carry, -1 or 0
+	movl	%edx, %ebx		C new low carry
+
+	mull	VAR_MULT_HIGH
+
+	incl	%ecx
+	jnz	L(top)
+
+
+L(done):
+	andl	$1, %ebp		C 1 or 0
+	addl	%ebx, %eax
+
+	adcl	%ebp, %edx
+	ASSERT(nc)
+	movl	%eax, (%edi)		C store carry low
+
+	movl	%edx, %eax		C return carry high
+
+	popl	%ebp
+	popl	%ebx
+
+	popl	%edi
+	popl	%esi
+
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000..e1d0f05
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,142 @@
+dnl  Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+
+dnl  Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.2 cycles/crossproduct (approx)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP,    16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP,    8)
+defframe(PARAM_WP,    4)
+
+defframe(VAR_COUNTER, -4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+	pushl	%eax			C dummy push for allocating stack slot
+	pushl	%esi
+	pushl	%ebp
+	pushl	%edi
+deflit(`FRAME',16)
+
+	movl	PARAM_XP,%esi
+	movl	PARAM_WP,%edi
+	movl	PARAM_YP,%ebp
+
+	movl	(%esi),%eax		C load xp[0]
+	mull	(%ebp)			C multiply by yp[0]
+	movl	%eax,(%edi)		C store to wp[0]
+	movl	PARAM_XSIZE,%ecx	C xsize
+	decl	%ecx			C If xsize = 1, ysize = 1 too
+	jz	L(done)
+
+	movl	PARAM_XSIZE,%eax
+	pushl	%ebx
+FRAME_pushl()
+	movl	%edx,%ebx
+	leal	(%esi,%eax,4),%esi	C make xp point at end
+	leal	(%edi,%eax,4),%edi	C offset wp by xsize
+	negl	%ecx			C negate j size/index for inner loop
+	xorl	%eax,%eax		C clear carry
+
+	ALIGN(8)
+L(oop1):	adcl	$0,%ebx
+	movl	(%esi,%ecx,4),%eax	C load next limb at xp[j]
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	%eax,(%edi,%ecx,4)
+	incl	%ecx
+	movl	%edx,%ebx
+	jnz	L(oop1)
+
+	adcl	$0,%ebx
+	movl	PARAM_YSIZE,%eax
+	movl	%ebx,(%edi)		C most significant limb of product
+	addl	$4,%edi			C increment wp
+	decl	%eax
+	jz	L(skip)
+	movl	%eax,VAR_COUNTER	C set index i to ysize
+
+L(outer):
+	addl	$4,%ebp			C make ebp point to next y limb
+	movl	PARAM_XSIZE,%ecx
+	negl	%ecx
+	xorl	%ebx,%ebx
+
+	C code at 0x61 here, close enough to aligned
+L(oop2):
+	adcl	$0,%ebx
+	movl	(%esi,%ecx,4),%eax
+	mull	(%ebp)
+	addl	%ebx,%eax
+	movl	(%edi,%ecx,4),%ebx
+	adcl	$0,%edx
+	addl	%eax,%ebx
+	movl	%ebx,(%edi,%ecx,4)
+	incl	%ecx
+	movl	%edx,%ebx
+	jnz	L(oop2)
+
+	adcl	$0,%ebx
+
+	movl	%ebx,(%edi)
+	addl	$4,%edi
+	movl	VAR_COUNTER,%eax
+	decl	%eax
+	movl	%eax,VAR_COUNTER
+	jnz	L(outer)
+
+L(skip):
+	popl	%ebx
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	addl	$4,%esp
+	ret
+
+L(done):
+	movl	%edx,4(%edi)	C store to wp[1]
+	popl	%edi
+	popl	%ebp
+	popl	%esi
+	popl	%eax		C dummy pop for deallocating stack slot
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm
new file mode 100644
index 0000000..0e82144
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm
@@ -0,0 +1,146 @@
+dnl  Intel P5 mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 8.0 cycles/limb
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C
+C An arithmetic approach has been found to be slower than the table lookup,
+C due to needing too many instructions.
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: exporting the table to hamdist is incorrect as it hurt incremental
+C linking.
+
+	RODATA
+	ALIGN(8)
+	GLOBL	TABLE_NAME
+TABLE_NAME:
+forloop(i,0,255,
+`	.byte	m4_popcount(i)
+')
+
+defframe(PARAM_SIZE,8)
+defframe(PARAM_SRC, 4)
+
+	TEXT
+	ALIGN(8)
+
+PROLOGUE(mpn_popcount)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %ecx
+	pushl	%esi	FRAME_pushl()
+
+ifdef(`PIC',`
+	pushl	%ebx	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+ifdef(`DARWIN',`
+	shll	%ecx		C size in byte pairs
+	LEA(	TABLE_NAME, %ebp)
+	movl	PARAM_SRC, %esi
+	xorl	%eax, %eax	C total
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+',`
+	call	L(here)
+L(here):
+	popl	%ebp
+	shll	%ecx		C size in byte pairs
+
+	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+	movl	PARAM_SRC, %esi
+
+	xorl	%eax, %eax	C total
+	xorl	%ebx, %ebx	C byte
+
+	movl	TABLE_NAME@GOT(%ebp), %ebp
+	xorl	%edx, %edx	C byte
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+	shll	%ecx		C size in byte pairs
+	movl	PARAM_SRC, %esi
+
+	pushl	%ebx	FRAME_pushl()
+	xorl	%eax, %eax	C total
+
+	xorl	%ebx, %ebx	C byte
+	xorl	%edx, %edx	C byte
+
+define(TABLE,`TABLE_NAME`'($1)')
+')
+
+
+	ALIGN(8)	C necessary on P55 for claimed speed
+L(top):
+	C eax	total
+	C ebx	byte
+	C ecx	counter, 2*size to 2
+	C edx	byte
+	C esi	src
+	C edi
+	C ebp	[PIC] table
+
+	addl	%ebx, %eax
+	movb	-1(%esi,%ecx,2), %bl
+
+	addl	%edx, %eax
+	movb	-2(%esi,%ecx,2), %dl
+
+	movb	TABLE(%ebx), %bl
+	decl	%ecx
+
+	movb	TABLE(%edx), %dl
+	jnz	L(top)
+
+
+ifdef(`PIC',`
+	popl	%ebp
+')
+	addl	%ebx, %eax
+	popl	%ebx
+
+	addl	%edx, %eax
+	popl	%esi
+
+	ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000..2105c4c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,243 @@
+dnl  Intel Pentium mpn_rshift -- mpn right shift.
+
+dnl  Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C         cycles/limb
+C P5,P54:    6.0
+C P55:       5.375
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+	pushl	%ebp
+deflit(`FRAME',16)
+
+	movl	PARAM_DST,%edi
+	movl	PARAM_SRC,%esi
+	movl	PARAM_SIZE,%ebp
+	movl	PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+	cmp	$1,%ecx
+	jne	L(normal)
+	leal	4(%edi),%eax
+	cmpl	%esi,%eax
+	jnc	L(special)		C jump if res_ptr + 1 >= s_ptr
+	leal	(%edi,%ebp,4),%eax
+	cmpl	%eax,%esi
+	jnc	L(special)		C jump if s_ptr >= res_ptr + size
+
+L(normal):
+	movl	(%esi),%edx
+	addl	$4,%esi
+	xorl	%eax,%eax
+	shrdl(	%cl, %edx, %eax)	C compute carry limb
+	pushl	%eax			C push carry limb onto stack
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+	jz	L(end)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(oop):	movl	28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	4(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,(%edi)
+	movl	%eax,4(%edi)
+
+	movl	8(%esi),%ebx
+	movl	12(%esi),%eax
+	shrdl(	%cl, %ebx, %edx)
+	shrdl(	%cl, %eax, %ebx)
+	movl	%edx,8(%edi)
+	movl	%ebx,12(%edi)
+
+	movl	16(%esi),%edx
+	movl	20(%esi),%ebx
+	shrdl(	%cl, %edx, %eax)
+	shrdl(	%cl, %ebx, %edx)
+	movl	%eax,16(%edi)
+	movl	%edx,20(%edi)
+
+	movl	24(%esi),%eax
+	movl	28(%esi),%edx
+	shrdl(	%cl, %eax, %ebx)
+	shrdl(	%cl, %edx, %eax)
+	movl	%ebx,24(%edi)
+	movl	%eax,28(%edi)
+
+	addl	$32,%esi
+	addl	$32,%edi
+	decl	%ebp
+	jnz	L(oop)
+
+L(end):	popl	%ebp
+	andl	$7,%ebp
+	jz	L(end2)
+L(oop2):
+	movl	(%esi),%eax
+	shrdl(	%cl,%eax,%edx)		C compute result limb
+	movl	%edx,(%edi)
+	movl	%eax,%edx
+	addl	$4,%esi
+	addl	$4,%edi
+	decl	%ebp
+	jnz	L(oop2)
+
+L(end2):
+	shrl	%cl,%edx		C compute most significant limb
+	movl	%edx,(%edi)		C store it
+
+	popl	%eax			C pop carry limb
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+
+C We loop from least significant end of the arrays, which is only
+C permissable if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+	leal	-4(%edi,%ebp,4),%edi
+	leal	-4(%esi,%ebp,4),%esi
+
+	movl	(%esi),%edx
+	subl	$4,%esi
+
+	decl	%ebp
+	pushl	%ebp
+	shrl	$3,%ebp
+
+	shrl	%edx
+	incl	%ebp
+	decl	%ebp
+	jz	L(Lend)
+
+	movl	(%edi),%eax		C fetch destination cache line
+
+	ALIGN(4)
+L(Loop):
+	movl	-28(%edi),%eax		C fetch destination cache line
+	movl	%edx,%ebx
+
+	movl	(%esi),%eax
+	movl	-4(%esi),%edx
+	rcrl	%eax
+	movl	%ebx,(%edi)
+	rcrl	%edx
+	movl	%eax,-4(%edi)
+
+	movl	-8(%esi),%ebx
+	movl	-12(%esi),%eax
+	rcrl	%ebx
+	movl	%edx,-8(%edi)
+	rcrl	%eax
+	movl	%ebx,-12(%edi)
+
+	movl	-16(%esi),%edx
+	movl	-20(%esi),%ebx
+	rcrl	%edx
+	movl	%eax,-16(%edi)
+	rcrl	%ebx
+	movl	%edx,-20(%edi)
+
+	movl	-24(%esi),%eax
+	movl	-28(%esi),%edx
+	rcrl	%eax
+	movl	%ebx,-24(%edi)
+	rcrl	%edx
+	movl	%eax,-28(%edi)
+
+	leal	-32(%esi),%esi		C use leal not to clobber carry
+	leal	-32(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop)
+
+L(Lend):
+	popl	%ebp
+	sbbl	%eax,%eax		C save carry in %eax
+	andl	$7,%ebp
+	jz	L(Lend2)
+	addl	%eax,%eax		C restore carry from eax
+L(Loop2):
+	movl	%edx,%ebx
+	movl	(%esi),%edx
+	rcrl	%edx
+	movl	%ebx,(%edi)
+
+	leal	-4(%esi),%esi		C use leal not to clobber carry
+	leal	-4(%edi),%edi
+	decl	%ebp
+	jnz	L(Loop2)
+
+	jmp	L(L1)
+L(Lend2):
+	addl	%eax,%eax		C restore carry from eax
+L(L1):	movl	%edx,(%edi)		C store last limb
+
+	movl	$0,%eax
+	rcrl	%eax
+
+	popl	%ebp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000..b11d767
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,528 @@
+dnl  Intel P5 mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+C product at around 20x20 limbs.
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+
+	movl	(%eax), %eax
+	ja	L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	mull	%eax
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	ret
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	pushl	%esi
+	pushl	%ebx
+
+	movl	%eax, %ebx
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	movl	%eax, (%ecx)	C dst[0]
+	movl	%edx, %esi	C dst[1]
+
+	movl	4(%ebx), %eax
+
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi
+	popl	%ebx
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+	popl	%esi
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+	C eax	src low limb
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	cmpl	$4, %edx
+	pushl	%ebx
+deflit(`FRAME',4)
+
+	movl	PARAM_SRC, %ebx
+	jae	L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebp
+	pushl	%edi
+
+	mull	%eax		C src[0] ^ 2
+
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+
+	movl	4(%ebx), %eax
+	xorl	%ebp, %ebp
+
+	mull	%eax		C src[1] ^ 2
+
+	movl	%eax, 8(%ecx)
+	movl	%edx, 12(%ecx)
+
+	movl	8(%ebx), %eax
+	pushl	%esi		C risk of cache bank clash
+
+	mull	%eax		C src[2] ^ 2
+
+	movl	%eax, 16(%ecx)
+	movl	%edx, 20(%ecx)
+
+	movl	(%ebx), %eax
+
+	mull	4(%ebx)		C src[0] * src[1]
+
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	movl	(%ebx), %eax
+
+	mull	8(%ebx)		C src[0] * src[2]
+
+	addl	%eax, %edi
+	movl	%edx, %ebp
+
+	adcl	$0, %ebp
+	movl	4(%ebx), %eax
+
+	mull	8(%ebx)		C src[1] * src[2]
+
+	xorl	%ebx, %ebx
+	addl	%eax, %ebp
+
+	C eax
+	C ebx	zero, will be dst[5]
+	C ecx	dst
+	C edx	dst[4]
+	C esi	dst[1]
+	C edi	dst[2]
+	C ebp	dst[3]
+
+	adcl	$0, %edx
+	addl	%esi, %esi
+
+	adcl	%edi, %edi
+
+	adcl	%ebp, %ebp
+
+	adcl	%edx, %edx
+	movl	4(%ecx), %eax
+
+	adcl	$0, %ebx
+	addl	%esi, %eax
+
+	movl	%eax, 4(%ecx)
+	movl	8(%ecx), %eax
+
+	adcl	%edi, %eax
+	movl	12(%ecx), %esi
+
+	adcl	%ebp, %esi
+	movl	16(%ecx), %edi
+
+	movl	%eax, 8(%ecx)
+	movl	%esi, 12(%ecx)
+
+	adcl	%edx, %edi
+	popl	%esi
+
+	movl	20(%ecx), %eax
+	movl	%edi, 16(%ecx)
+
+	popl	%edi
+	popl	%ebp
+
+	adcl	%ebx, %eax	C no carry out of this
+	popl	%ebx
+
+	movl	%eax, 20(%ecx)
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(four_or_more):
+	C eax	src low limb
+	C ebx	src
+	C ecx	dst
+	C edx	size
+	C esi
+	C edi
+	C ebp
+	C
+	C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+	pushl	%edi
+FRAME_pushl()
+	pushl	%esi
+FRAME_pushl()
+
+	pushl	%ebp
+FRAME_pushl()
+	leal	(%ecx,%edx,4), %edi	C dst end of this mul1
+
+	leal	(%ebx,%edx,4), %esi	C src end
+	movl	%ebx, %ebp		C src
+
+	negl	%edx			C -size
+	xorl	%ebx, %ebx		C clear carry limb and carry flag
+
+	leal	1(%edx), %ecx		C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	src
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(mul1)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+	C n=1..size-2.
+	C
+	C The last two products, which are the end corner of the product
+	C triangle, are handled separately to save looping overhead.  These
+	C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+	C If size is 4 then it's only these that need to be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	adcl	$0, %ebx
+	movl	PARAM_SIZE, %edx
+
+	movl	%ebx, (%edi)
+	subl	$4, %edx
+
+	negl	%edx
+	jz	L(corner)
+
+
+L(outer):
+	C ebx	previous carry limb to store
+	C edx	outer loop counter (negative)
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+
+	pushl	%edx			C new outer loop counter
+	leal	-2(%edx), %ecx
+
+	movl	%ebx, (%edi)
+	addl	$4, %edi
+
+	addl	$4, %ebp
+	xorl	%ebx, %ebx		C initial carry limb, clear carry flag
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	&src[j]
+
+	adcl	$0, %ebx
+	movl	(%esi,%ecx,4), %eax
+
+	mull	(%ebp)
+
+	addl	%ebx, %eax
+	movl	(%edi,%ecx,4), %ebx
+
+	adcl	$0, %edx
+	addl	%eax, %ebx
+
+	movl	%ebx, (%edi,%ecx,4)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(inner)
+
+
+	adcl	$0, %ebx
+	popl	%edx		C outer loop counter
+
+	incl	%edx
+	jnz	L(outer)
+
+
+	movl	%ebx, (%edi)
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+
+	movl	-8(%esi), %eax
+	movl	-4(%edi), %ebx		C risk of data cache bank clash here
+
+	mull	-12(%esi)		C src[size-2]*src[size-3]
+
+	addl	%eax, %ebx
+	movl	%edx, %ecx
+
+	adcl	$0, %ecx
+	movl	-4(%esi), %eax
+
+	mull	-12(%esi)		C src[size-1]*src[size-3]
+
+	addl	%ecx, %eax
+	movl	(%edi), %ecx
+
+	adcl	$0, %edx
+	movl	%ebx, -4(%edi)
+
+	addl	%eax, %ecx
+	movl	%edx, %ebx
+
+	adcl	$0, %ebx
+	movl	-4(%esi), %eax
+
+	mull	-8(%esi)		C src[size-1]*src[size-2]
+
+	movl	%ecx, (%edi)
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+	movl	PARAM_SIZE, %eax
+
+	negl	%eax
+	movl	%ebx, 4(%edi)
+
+	addl	$1, %eax		C -(size-1) and clear carry
+	movl	%edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-4]
+	C ebp
+
+	movl	12(%edi,%eax,8), %ebx
+
+	rcll	%ebx
+	movl	16(%edi,%eax,8), %ecx
+
+	rcll	%ecx
+	movl	%ebx, 12(%edi,%eax,8)
+
+	movl	%ecx, 16(%edi,%eax,8)
+	incl	%eax
+
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	PARAM_SRC, %esi
+
+	movl	PARAM_SIZE, %ecx	C risk of cache bank clash
+	movl	%eax, 12(%edi)		C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+	movl	(%esi), %eax		C src[0]
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx
+
+	mull	%eax
+
+	movl	%eax, 16(%edi,%ecx,8)	C dst[0]
+	movl	%edx, %ebx
+
+	addl	$1, %ecx		C size-1 and clear carry
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, negative
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-4]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %ebx
+
+	mull	%eax
+
+	movl	16-4(%edi,%ecx,8), %ebp
+
+	addl	%ebp, %ebx
+	movl	16(%edi,%ecx,8), %ebp
+
+	adcl	%eax, %ebp
+	movl	%ebx, 16-4(%edi,%ecx,8)
+
+	movl	%ebp, 16(%edi,%ecx,8)
+	incl	%ecx
+
+	movl	%edx, %ebx
+	jnz	L(diag)
+
+
+	adcl	$0, %edx
+	movl	16-4(%edi), %eax	C dst most significant limb
+
+	addl	%eax, %edx
+	popl	%ebp
+
+	movl	%edx, 16-4(%edi)
+	popl	%esi		C risk of cache bank clash
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
author	Thomas Voss <mail@thomasvoss.com>	2024-06-21 23:36:36 +0200
committer	Thomas Voss <mail@thomasvoss.com>	2024-06-21 23:42:26 +0200
commit	a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree	b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium
parent	1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)