author    Thomas Voss <mail@thomasvoss.com>  2024-06-21 23:36:36 +0200
committer Thomas Voss <mail@thomasvoss.com>  2024-06-21 23:42:26 +0200
commit    a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree      b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86/pentium
parent    1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86/pentium')
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/README           | 181
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm       | 203
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm    | 144
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm     | 266
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/com.asm          | 181
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm        | 146
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm        | 164
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm       | 264
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h     |  76
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm      | 154
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm     | 176
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm       | 243
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h | 163
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm  |  40
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm   | 463
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm    | 371
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm   | 468
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm  | 192
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm       | 279
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm        | 177
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm        | 150
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm | 142
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm     | 146
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm       | 243
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm | 528
25 files changed, 5560 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/README b/vendor/gmp-6.3.0/mpn/x86/pentium/README
new file mode 100644
index 0000000..305936b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/README
@@ -0,0 +1,181 @@
+Copyright 1996, 1999-2001, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+ INTEL PENTIUM P5 MPN SUBROUTINES
+
+
+This directory contains mpn functions optimized for Intel Pentium (P5,P54)
+processors. The mmx subdirectory has additional code for Pentium with MMX
+(P55).
+
+
+STATUS
+
+ cycles/limb
+
+ mpn_add_n/sub_n 2.375
+
+ mpn_mul_1 12.0
+ mpn_add/submul_1 14.0
+
+ mpn_mul_basecase 14.2 cycles/crossproduct (approx)
+
+ mpn_sqr_basecase 8 cycles/crossproduct (approx)
+ or 15.5 cycles/triangleproduct (approx)
+
+ mpn_l/rshift 5.375 normal (6.0 on P54)
+ 1.875 special shift by 1 bit
+
+ mpn_divrem_1 44.0
+ mpn_mod_1 28.0
+ mpn_divexact_by3 15.0
+
+ mpn_copyi/copyd 1.0
+
+Pentium MMX gets the following improvements
+
+ mpn_l/rshift 1.75
+
+ mpn_mul_1 12.0 normal, 7.0 for 16-bit multiplier
+
+
+mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop
+overhead and other delays (cache refill?), they run at or near 2.5
+cycles/limb.
+
+mpn_mul_1, mpn_addmul_1 and mpn_submul_1 all run 1 cycle faster than they
+should.  Intel documentation says a mul instruction takes 10 cycles, but it
+measures as 9, and the routines using it run on that basis.
+
+
+
+P55 MMX AND X87
+
+The cost of switching between MMX and x87 floating point on P55 is about 100
+cycles (fld1/por/emms for instance).  To avoid that cost the two aren't
+mixed, and currently that means using MMX and not x87.
+
+MMX offers a big speedup for lshift and rshift, and a nice speedup for
+16-bit multipliers in mpn_mul_1. If fast code using x87 is found then
+perhaps the preference for MMX will be reversed.
+
+
+
+
+P54 SHLDL
+
+mpn_lshift and mpn_rshift run at about 6 cycles/limb on P5 and P54, but the
+documentation indicates that they should take only 43/8 = 5.375 cycles/limb,
+or 5 cycles/limb asymptotically. The P55 runs them at the expected speed.
+
+It seems that on P54 a shldl or shrdl allows pairing in one following cycle,
+but not two. For example, back to back repetitions of the following
+
+ shldl( %cl, %eax, %ebx)
+ xorl %edx, %edx
+ xorl %esi, %esi
+
+run at 5 cycles, as expected, but repetitions of the following run at 7
+cycles, whereas 6 would be expected (and is achieved on P55),
+
+ shldl( %cl, %eax, %ebx)
+ xorl %edx, %edx
+ xorl %esi, %esi
+ xorl %edi, %edi
+ xorl %ebp, %ebp
+
+Three xorls run at 7 cycles too, so it doesn't seem to be simply that
+pairing is inhibited only in the second following cycle (or something like
+that).
+
+Avoiding this problem would bring P54 shifts down from 6.0 c/l to 5.5 with a
+pattern of shift, 2 loads, shift, 2 stores, shift, etc. A start has been
+made on something like that, but it's not yet complete.
+
+
+
+
+OTHER NOTES
+
+Prefetching Destinations
+
+  Pentium doesn't allocate cache lines on writes, unlike most other modern
+  processors.  Since the functions in the mpn class do array writes, we
+  have to handle allocating the destination cache lines by reading a word
+  from them in the loops, to achieve the best performance.
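+
+  For instance, the unrolled loops in the files here touch the far end of
+  the destination cache line at the top of each pass; a sketch of the
+  pattern, as in aors_n.asm (not a complete loop):
+
+	movl	28(%edi), %eax	C fetch destination cache line
+	leal	32(%edi), %edi	C the eight stores that follow hit L1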
+
+Prefetching Sources
+
+  Prefetching of sources is pointless since there are no out-of-order loads.
+  Any load instruction blocks until the line is brought to L1, so it may
+  as well be the load that wants the data which blocks.
+
+Data Cache Bank Clashes
+
+ Pairing of memory operations requires that the two issued operations
+ refer to different cache banks (ie. different addresses modulo 32
+ bytes). The simplest way to ensure this is to read/write two words from
+ the same object. If we make operations on different objects, they might
+ or might not be to the same cache bank.
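+
+  For instance, the copy loops read and write words pairwise from a single
+  object; a sketch of the pattern, as in copyi.asm:
+
+	movl	(%esi), %eax	C read words pairwise
+	movl	4(%esi), %ebx
+	movl	%eax, -60(%edx,%ecx,4)	C store words pairwise
+	movl	%ebx, -56(%edx,%ecx,4)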
+
+PIC %eip Fetching
+
+  A simple call $+5 and popl can be used to get %eip; there's no need to
+  balance calls and returns since P5 doesn't have any return stack branch
+  prediction.
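+
+  The idiom, as used by bdiv_q_1.asm and hamdist.asm in this directory:
+
+	call	L(here)
+  L(here):
+	popl	%ebp		C %ebp = address of L(here)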
+
+Float Multiplies
+
+ fmul is pairable and can be issued every 2 cycles (with a 4 cycle
+ latency for data ready to use). This is a lot better than integer mull
+ or imull at 9 cycles non-pairing. Unfortunately the advantage is
+ quickly eaten away by needing to throw data through memory back to the
+ integer registers to adjust for fild and fist being signed, and to do
+ things like propagating carry bits.
+
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is mostly about P5, the parts about P6 aren't relevant. Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm
new file mode 100644
index 0000000..01ebfb9
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aors_n.asm
@@ -0,0 +1,203 @@
+dnl Intel Pentium mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 2.375 cycles/limb
+
+
+ifdef(`OPERATION_add_n',`
+ define(M4_inst, adcl)
+ define(M4_function_n, mpn_add_n)
+ define(M4_function_nc, mpn_add_nc)
+
+',`ifdef(`OPERATION_sub_n',`
+ define(M4_inst, sbbl)
+ define(M4_function_n, mpn_sub_n)
+ define(M4_function_nc, mpn_sub_nc)
+
+',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
+')')')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+
+C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size, mp_limb_t carry);
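+C
+C A minimal caller-side sketch (dst/src1/src2/size as in the prototypes
+C above):
+C
+C	mp_limb_t cy = mpn_add_n (dst, src1, src2, size);
+C	/* dst[] = src1[] + src2[]; cy is the carry out of the top limb */
+C
+C The _nc variants take an extra carry-in limb and likewise return the
+C carry out.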
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(M4_function_nc)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(endgo)
+
+ pushl %edx
+FRAME_pushl()
+ movl PARAM_CARRY,%eax
+ shrl %eax C shift bit 0 into carry
+ jmp L(oop)
+
+L(endgo):
+deflit(`FRAME',16)
+ movl PARAM_CARRY,%eax
+ shrl %eax C shift bit 0 into carry
+ jmp L(end)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_n)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC1,%esi
+ movl PARAM_SRC2,%ebp
+ movl PARAM_SIZE,%ecx
+
+ movl (%ebp),%ebx
+
+ decl %ecx
+ movl %ecx,%edx
+ shrl $3,%ecx
+ andl $7,%edx
+ testl %ecx,%ecx C zero carry flag
+ jz L(end)
+ pushl %edx
+FRAME_pushl()
+
+ ALIGN(8)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ leal 32(%edi),%edi
+
+L(1): movl (%esi),%eax
+ movl 4(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 8(%ebp),%ebx
+ movl %eax,-32(%edi)
+ movl %edx,-28(%edi)
+
+L(2): movl 8(%esi),%eax
+ movl 12(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 12(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 16(%ebp),%ebx
+ movl %eax,-24(%edi)
+ movl %edx,-20(%edi)
+
+L(3): movl 16(%esi),%eax
+ movl 20(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 20(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 24(%ebp),%ebx
+ movl %eax,-16(%edi)
+ movl %edx,-12(%edi)
+
+L(4): movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ M4_inst %ebx,%eax
+ movl 28(%ebp),%ebx
+ M4_inst %ebx,%edx
+ movl 32(%ebp),%ebx
+ movl %eax,-8(%edi)
+ movl %edx,-4(%edi)
+
+ leal 32(%esi),%esi
+ leal 32(%ebp),%ebp
+ decl %ecx
+ jnz L(oop)
+
+ popl %edx
+FRAME_popl()
+L(end):
+ decl %edx C test %edx w/o clobbering carry
+ js L(end2)
+ incl %edx
+L(oop2):
+ leal 4(%edi),%edi
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl 4(%ebp),%ebx
+ movl %eax,-4(%edi)
+ leal 4(%esi),%esi
+ leal 4(%ebp),%ebp
+ decl %edx
+ jnz L(oop2)
+L(end2):
+ movl (%esi),%eax
+ M4_inst %ebx,%eax
+ movl %eax,(%edi)
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm
new file mode 100644
index 0000000..d83cc45
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/aorsmul_1.asm
@@ -0,0 +1,144 @@
+dnl Intel Pentium mpn_addmul_1 -- mpn by limb multiplication.
+
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
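+C A caller-side sketch for the addmul case (names as in the prototypes
+C above):
+C
+C	mp_limb_t hi = mpn_addmul_1 (dst, src, size, mult);
+C	/* dst[] += src[] * mult; hi is the carry limb not stored in dst */
+C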
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+ ALIGN(8)
+PROLOGUE(M4_function_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ pushl %esi FRAME_pushl()
+
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(M4_function_1)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+ pushl %esi FRAME_pushl()
+
+L(start_1c):
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %eax
+
+ pushl %edi FRAME_pushl()
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ leal -1(%eax), %ebx C size-1
+
+ leal (%esi,%eax,4), %esi
+ xorl $-1, %ebx C -size, and clear carry
+
+ leal (%edi,%eax,4), %edi
+
+L(top):
+ C eax
+ C ebx counter, negative
+ C ecx carry
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp
+
+ adcl $0, %ecx
+ movl (%esi,%ebx,4), %eax
+
+ mull PARAM_MULTIPLIER
+
+ addl %ecx, %eax
+ movl (%edi,%ebx,4), %ecx
+
+ adcl $0, %edx
+ M4_inst %eax, %ecx
+
+ movl %ecx, (%edi,%ebx,4)
+ incl %ebx
+
+ movl %edx, %ecx
+ jnz L(top)
+
+
+ adcl $0, %ecx
+ popl %ebx
+
+ movl %ecx, %eax
+ popl %edi
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm
new file mode 100644
index 0000000..c2c4f58
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/bdiv_q_1.asm
@@ -0,0 +1,266 @@
+dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Rearranged from mpn/x86/pentium/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2011, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C divisor
+C odd even
+C P54: 24.5 30.5 cycles/limb
+C P55: 23.0 28.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected. On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 cycle/limb slowdown for both odd and even.
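+C
+C mpn_pi1_bdiv_q_1 takes the same divisor together with a precomputed
+C inverse and twos-count, so that setup cost can be amortized over many
+C divisions by the same d; in outline (names as in the prototype further
+C down):
+C
+C	mpn_pi1_bdiv_q_1 (dst, src, size, divisor, inverse, shift);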
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(32)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl $-1, %ecx
+ movl PARAM_DIVISOR, %eax
+
+L(strip_twos):
+ ASSERT(nz, `orl %eax, %eax')
+ shrl %eax
+ incl %ecx C shift count
+
+ jnc L(strip_twos)
+
+ leal 1(%eax,%eax), %edx C d
+ andl $127, %eax C d/2, 7 bits
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %eax
+',`
+ call L(here)
+L(here):
+ popl %ebp C eip
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+ C AGI
+ movl binvert_limb_table@GOT(%ebp), %ebp
+ C AGI
+ movzbl (%eax,%ebp), %eax
+')
+',`
+
+dnl non-PIC
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ movl %eax, %ebp C inv
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl PARAM_SIZE, %ebx
+
+ movl %eax, %ebp
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl %edx, PARAM_DIVISOR C d without twos
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ jmp L(common)
+EPILOGUE()
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SHIFT, %ecx
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SIZE, %ebx
+ movl PARAM_INVERSE, %eax
+
+L(common):
+ pushl %esi FRAME_pushl()
+ push %edi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+ movl %eax, VAR_INVERSE
+
+ leal (%esi,%ebx,4), %esi C src end
+ leal (%edi,%ebx,4), %edi C dst end
+
+ negl %ebx C -size
+
+ xorl %ebp, %ebp C initial carry bit
+
+ orl %ecx, %ecx C shift
+ movl (%esi,%ebx,4), %eax C src low limb
+ jz L(odd_entry)
+
+ xorl %edx, %edx C initial carry limb (for even, if one)
+ incl %ebx
+ jz L(one)
+
+ movl (%esi,%ebx,4), %edx C src second limb (for even)
+ shrdl( %cl, %edx, %eax)
+
+ jmp L(even_entry)
+
+
+ ALIGN(8)
+L(odd_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ebp, %edx
+
+ subl %edx, %eax
+
+ sbbl %ebp, %ebp
+
+L(odd_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, (%edi,%ebx,4)
+
+ incl %ebx
+ jnz L(odd_top)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx twos
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ subl %ebp, %edx C carry bit
+ movl -4(%esi,%ebx,4), %eax C src limb
+
+ movl (%esi,%ebx,4), %ebp C and one above it
+
+ shrdl( %cl, %ebp, %eax)
+
+ subl %edx, %eax C carry limb
+
+ sbbl %ebp, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+
+ jnz L(even_top)
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax C src high limb
+ subl %ebp, %edx
+
+L(one):
+ shrl %cl, %eax
+
+ subl %edx, %eax C no carry if division is exact
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi) C dst high limb
+ nop C protect against cache bank clash
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm
new file mode 100644
index 0000000..b080545
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/com.asm
@@ -0,0 +1,181 @@
+dnl Intel Pentium mpn_com -- mpn ones complement.
+
+dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb
+
+
+NAILS_SUPPORT(0-31)
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C This code is similar to mpn_copyi; basically there are just some "xorl
+C $GMP_NUMB_MASK"s inserted.
+C
+C Alternatives:
+C
+C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
+C are the same alignment mod 8, but it doesn't seem worth the trouble for
+C just that case (there'd need to be some plain integer available too for
+C the unaligned case).
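+C
+C At the C level, mpn_com (dst, src, size) simply stores
+C (~src[i]) & GMP_NUMB_MASK at dst[i] for each of the size limbs.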
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %eax
+ movl PARAM_SIZE, %ecx
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ leal (%eax,%ecx,4), %eax
+ xorl $-1, %ecx C -size-1
+
+ movl PARAM_DST, %edx
+ addl $8, %ecx C -size+7
+
+ jns L(end)
+
+ movl (%edx), %esi C fetch destination cache line
+ nop
+
+L(top):
+ C eax &src[size]
+ C ebx
+ C ecx counter, limbs, negative
+ C edx dst, incrementing
+ C esi scratch
+ C edi scratch
+ C ebp
+
+ movl 28(%edx), %esi C destination prefetch
+ addl $32, %edx
+
+ movl -28(%eax,%ecx,4), %esi
+ movl -24(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -32(%edx)
+ movl %edi, -28(%edx)
+
+ movl -20(%eax,%ecx,4), %esi
+ movl -16(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -24(%edx)
+ movl %edi, -20(%edx)
+
+ movl -12(%eax,%ecx,4), %esi
+ movl -8(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -16(%edx)
+ movl %edi, -12(%edx)
+
+ movl -4(%eax,%ecx,4), %esi
+ movl (%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, -8(%edx)
+ movl %edi, -4(%edx)
+
+ addl $8, %ecx
+ js L(top)
+
+
+L(end):
+ C eax &src[size]
+ C ecx 0 to 7, representing respectively 7 to 0 limbs remaining
+ C edx dst, next location to store
+
+ subl $4, %ecx
+ nop
+
+ jns L(no4)
+
+ movl -12(%eax,%ecx,4), %esi
+ movl -8(%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, (%edx)
+ movl %edi, 4(%edx)
+
+ movl -4(%eax,%ecx,4), %esi
+ movl (%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, 8(%edx)
+ movl %edi, 12(%edx)
+
+ addl $16, %edx
+ addl $4, %ecx
+L(no4):
+
+ subl $2, %ecx
+ nop
+
+ jns L(no2)
+
+ movl -4(%eax,%ecx,4), %esi
+ movl (%eax,%ecx,4), %edi
+ xorl $GMP_NUMB_MASK, %esi
+ xorl $GMP_NUMB_MASK, %edi
+ movl %esi, (%edx)
+ movl %edi, 4(%edx)
+
+ addl $8, %edx
+ addl $2, %ecx
+L(no2):
+
+ popl %edi
+ jnz L(done)
+
+ movl -4(%eax), %ecx
+
+ xorl $GMP_NUMB_MASK, %ecx
+ popl %esi
+
+ movl %ecx, (%edx)
+ ret
+
+L(done):
+ popl %esi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm
new file mode 100644
index 0000000..72a543b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyd.asm
@@ -0,0 +1,146 @@
+dnl Intel Pentium mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C See comments in copyi.asm.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+ movl PARAM_SRC, %eax
+ movl PARAM_SIZE, %ecx
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ leal -4(%eax,%ecx,4), %eax C &src[size-1]
+ movl PARAM_DST, %edx
+
+ subl $7, %ecx C size-7
+ jle L(end)
+
+ movl 28-4(%edx,%ecx,4), %esi C prefetch cache, dst[size-1]
+ nop
+
+L(top):
+ C eax src, decrementing
+ C ebx
+ C ecx counter, limbs
+ C edx dst
+ C esi scratch
+ C edi scratch
+ C ebp
+
+ movl 28-32(%edx,%ecx,4), %esi C prefetch dst cache line
+ subl $8, %ecx
+
+ movl (%eax), %esi C read words pairwise
+ movl -4(%eax), %edi
+ movl %esi, 56(%edx,%ecx,4) C store words pairwise
+ movl %edi, 52(%edx,%ecx,4)
+
+ movl -8(%eax), %esi
+ movl -12(%eax), %edi
+ movl %esi, 48(%edx,%ecx,4)
+ movl %edi, 44(%edx,%ecx,4)
+
+ movl -16(%eax), %esi
+ movl -20(%eax), %edi
+ movl %esi, 40(%edx,%ecx,4)
+ movl %edi, 36(%edx,%ecx,4)
+
+ movl -24(%eax), %esi
+ movl -28(%eax), %edi
+ movl %esi, 32(%edx,%ecx,4)
+ movl %edi, 28(%edx,%ecx,4)
+
+ leal -32(%eax), %eax
+ jg L(top)
+
+
+L(end):
+ C ecx -7 to 0, representing respectively 0 to 7 limbs remaining
+ C eax src end
+ C edx dst, next location to store
+
+ addl $4, %ecx
+ jle L(no4)
+
+ movl (%eax), %esi
+ movl -4(%eax), %edi
+ movl %esi, 8(%edx,%ecx,4)
+ movl %edi, 4(%edx,%ecx,4)
+
+ movl -8(%eax), %esi
+ movl -12(%eax), %edi
+ movl %esi, (%edx,%ecx,4)
+ movl %edi, -4(%edx,%ecx,4)
+
+ subl $16, %eax
+ subl $4, %ecx
+L(no4):
+
+ addl $2, %ecx
+ jle L(no2)
+
+ movl (%eax), %esi
+ movl -4(%eax), %edi
+ movl %esi, (%edx,%ecx,4)
+ movl %edi, -4(%edx,%ecx,4)
+
+ subl $8, %eax
+ subl $2, %ecx
+L(no2):
+
+ jnz L(done)
+
+ movl (%eax), %ecx
+ movl %ecx, (%edx) C risk of cache bank clash here
+
+L(done):
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm
new file mode 100644
index 0000000..d983d6b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/copyi.asm
@@ -0,0 +1,164 @@
+dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing.
+
+dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Destination prefetching is done to avoid repeated write-throughs on lines
+C not already in L1.
+C
+C At least one of the src or dst pointers needs to be incremented rather
+C than using indexing, so that there's somewhere to put the loop control
+C without an AGI.  Incrementing one and not both keeps the loop overhead to
+C 2 cycles.  Making it the src pointer that's incremented avoids an AGI on
+C the %ecx subtracts in the finishup code.
+C
+C The block of finishup code is almost as big as the main loop itself, which
+C is unfortunate, but it's faster that way than with say rep movsl, by about
+C 10 cycles for instance on P55.
+C
+C There's nothing to be gained from MMX on P55, since it can do only one
+C movq load (or store) per cycle, so the throughput would be the same as the
+C code here (and even then only if src and dst have the same alignment mod
+C 8).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_DST, %edx
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+	leal	(%edx,%ecx,4), %edx	C &dst[size]
+ xorl $-1, %ecx C -size-1
+
+ movl PARAM_SRC, %esi
+ addl $8, %ecx C -size+7
+
+ jns L(end)
+
+ movl -28(%edx,%ecx,4), %eax C fetch destination cache line, dst[0]
+ nop
+
+L(top):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, limbs, negative
+ C edx &dst[size-1]
+ C esi src, incrementing
+ C edi
+ C ebp
+
+ movl (%edx,%ecx,4), %eax C fetch destination cache line
+ addl $8, %ecx
+
+ movl (%esi), %eax C read words pairwise
+ movl 4(%esi), %ebx
+ movl %eax, -60(%edx,%ecx,4) C store words pairwise
+ movl %ebx, -56(%edx,%ecx,4)
+
+ movl 8(%esi), %eax
+ movl 12(%esi), %ebx
+ movl %eax, -52(%edx,%ecx,4)
+ movl %ebx, -48(%edx,%ecx,4)
+
+ movl 16(%esi), %eax
+ movl 20(%esi), %ebx
+ movl %eax, -44(%edx,%ecx,4)
+ movl %ebx, -40(%edx,%ecx,4)
+
+ movl 24(%esi), %eax
+ movl 28(%esi), %ebx
+ movl %eax, -36(%edx,%ecx,4)
+ movl %ebx, -32(%edx,%ecx,4)
+
+ leal 32(%esi), %esi
+ js L(top)
+
+
+L(end):
+ C ecx 0 to 7, representing respectively 7 to 0 limbs remaining
+ C esi src end
+ C edx dst, next location to store
+
+ subl $4, %ecx
+ jns L(no4)
+
+ movl (%esi), %eax
+ movl 4(%esi), %ebx
+ movl %eax, -12(%edx,%ecx,4)
+ movl %ebx, -8(%edx,%ecx,4)
+
+ movl 8(%esi), %eax
+ movl 12(%esi), %ebx
+ movl %eax, -4(%edx,%ecx,4)
+ movl %ebx, (%edx,%ecx,4)
+
+ addl $16, %esi
+ addl $4, %ecx
+L(no4):
+
+ subl $2, %ecx
+ jns L(no2)
+
+ movl (%esi), %eax
+ movl 4(%esi), %ebx
+ movl %eax, -4(%edx,%ecx,4)
+ movl %ebx, (%edx,%ecx,4)
+
+ addl $8, %esi
+ addl $2, %ecx
+L(no2):
+
+ jnz L(done)
+
+ movl (%esi), %eax
+ movl %eax, -4(%edx,%ecx,4) C risk of cache bank clash here
+
+L(done):
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm
new file mode 100644
index 0000000..21b5287
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/dive_1.asm
@@ -0,0 +1,264 @@
+dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C divisor
+C odd even
+C P54: 24.5 30.5 cycles/limb
+C P55: 23.0 28.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C Plain divl is used for small sizes, since the inverse takes a while to
+C set up.  Multiplying works out faster for size>=3 when the divisor is odd,
+C or size>=4 when the divisor is even.  Actually on P55 size==2 for odd or
+C size==3 for even are about the same speed for divl and mul alike, but the
+C former is used since it takes up less code cache.
+C
+C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
+C expected. On P54 in the even case the shrdl pairing nonsense (see
+C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
+C further 1.5 cycle/limb slowdown for both odd and even.
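+C
+C The limb inverse is built by Newton iterations inv = 2*inv - inv*inv*d,
+C each of which doubles the number of correct low bits.  A C sketch of the
+C setup below, for d odd (twos already stripped) and 32-bit limbs, with
+C unsigned overflow supplying the arithmetic mod 2^32:
+C
+C	inv = binvert_limb_table[(d >> 1) & 0x7F];	/*  8 bits */
+C	inv = 2*inv - inv*inv*d;			/* 16 bits */
+C	inv = 2*inv - inv*inv*d;			/* 32 bits */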
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ movl PARAM_SIZE, %ecx
+
+ pushl %esi FRAME_pushl()
+ push %edi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ andl $1, %eax
+
+ movl PARAM_DST, %edi
+ addl %ecx, %eax C size if even, size+1 if odd
+
+ cmpl $4, %eax
+ jae L(mul_by_inverse)
+
+
+ xorl %edx, %edx
+L(div_top):
+ movl -4(%esi,%ecx,4), %eax
+
+ divl PARAM_DIVISOR
+
+ movl %eax, -4(%edi,%ecx,4)
+ decl %ecx
+
+ jnz L(div_top)
+
+ popl %edi
+ popl %esi
+
+ ret
+
+
+
+L(mul_by_inverse):
+ movl PARAM_DIVISOR, %eax
+ movl $-1, %ecx
+
+L(strip_twos):
+ ASSERT(nz, `orl %eax, %eax')
+ shrl %eax
+ incl %ecx C shift count
+
+ jnc L(strip_twos)
+
+ leal 1(%eax,%eax), %edx C d
+ andl $127, %eax C d/2, 7 bits
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ifdef(`PIC',`dnl
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ movl %eax, %ebp C inv
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl PARAM_SIZE, %ebx
+
+ movl %eax, %ebp
+ addl %eax, %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ movl %edx, PARAM_DIVISOR C d without twos
+
+ leal (%esi,%ebx,4), %esi C src end
+ leal (%edi,%ebx,4), %edi C dst end
+
+ negl %ebx C -size
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ xorl %ebp, %ebp C initial carry bit
+
+ movl (%esi,%ebx,4), %eax C src low limb
+ orl %ecx, %ecx C shift
+
+ movl 4(%esi,%ebx,4), %edx C src second limb (for even)
+ jz L(odd_entry)
+
+ shrdl( %cl, %edx, %eax)
+
+ incl %ebx
+ jmp L(even_entry)
+
+
+ ALIGN(8)
+L(odd_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ebp, %edx
+
+ subl %edx, %eax
+
+ sbbl %ebp, %ebp
+
+L(odd_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, (%edi,%ebx,4)
+
+ incl %ebx
+ jnz L(odd_top)
+
+
+ popl %ebp
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx twos
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ mull PARAM_DIVISOR
+
+ subl %ebp, %edx C carry bit
+ movl -4(%esi,%ebx,4), %eax C src limb
+
+ movl (%esi,%ebx,4), %ebp C and one above it
+
+ shrdl( %cl, %ebp, %eax)
+
+ subl %edx, %eax C carry limb
+
+ sbbl %ebp, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+
+ jnz L(even_top)
+
+
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax C src high limb
+ subl %ebp, %edx
+
+ shrl %cl, %eax
+
+ subl %edx, %eax C no carry if division is exact
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi) C dst high limb
+ nop C protect against cache bank clash
+
+ popl %ebp
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h
new file mode 100644
index 0000000..befa6e2
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/gmp-mparam.h
@@ -0,0 +1,76 @@
+/* Intel P54 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 166MHz P54 */
+
+/* Generated by tuneup.c, 2004-02-10, gcc 2.95 */
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 90
+
+#define SQR_BASECASE_THRESHOLD 0 /* always */
+#define SQR_TOOM2_THRESHOLD 22
+#define SQR_TOOM3_THRESHOLD 122
+
+#define DIV_SB_PREINV_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_DC_THRESHOLD 52
+#define POWM_THRESHOLD 77
+
+#define HGCD_THRESHOLD 121
+#define GCD_ACCEL_THRESHOLD 3
+#define GCD_DC_THRESHOLD 615
+#define JACOBI_BASE_METHOD 2
+
+#define USE_PREINV_DIVREM_1 0
+#define USE_PREINV_MOD_1 1 /* native */
+#define DIVREM_2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define MODEXACT_1_ODD_THRESHOLD 0 /* always (native) */
+
+#define GET_STR_DC_THRESHOLD 23
+#define GET_STR_PRECOMPUTE_THRESHOLD 33
+#define SET_STR_THRESHOLD 2788
+
+#define MUL_FFT_TABLE { 432, 928, 1664, 3584, 10240, 40960, 0 }
+#define MUL_FFT_MODF_THRESHOLD 448
+#define MUL_FFT_THRESHOLD 3328
+
+#define SQR_FFT_TABLE { 496, 928, 1920, 4608, 10240, 40960, 0 }
+#define SQR_FFT_MODF_THRESHOLD 512
+#define SQR_FFT_THRESHOLD 3328
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm
new file mode 100644
index 0000000..6c6c1a1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/hamdist.asm
@@ -0,0 +1,154 @@
+dnl Intel P5 mpn_hamdist -- mpn hamming distance.
+
+dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.0 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
+C
+C It might be possible to shave 1 cycle from the loop, and hence 2
+C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor
+C would be 1, if the right schedule could be found (not found so far).
+C Wanting to avoid potential cache bank clashes makes it tricky.
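+C
+C In C terms, each loop iteration below handles two byte positions,
+C accumulating roughly
+C
+C	total += mpn_popcount_table[src1_byte ^ src2_byte];
+C
+C per byte, using the 256-entry byte popcount table from popcount.asm.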
+
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: referencing popcount.asm's table is incorrect, as it hurts
+C incremental linking.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC1, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_hamdist)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi FRAME_pushl()
+
+ shll %ecx C size in byte pairs
+ pushl %edi FRAME_pushl()
+
+ifdef(`PIC',`
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+ifdef(`DARWIN',`
+ movl PARAM_SRC1, %esi
+ movl PARAM_SRC2, %edi
+ LEA( TABLE_NAME, %ebp)
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+ xorl %eax, %eax C total
+',`
+ call L(here) FRAME_pushl()
+L(here):
+ movl PARAM_SRC1, %esi
+ popl %ebp FRAME_popl()
+
+ movl PARAM_SRC2, %edi
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+
+ movl TABLE_NAME@GOT(%ebp), %ebp
+ xorl %eax, %eax C total
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+ movl PARAM_SRC1, %esi
+ movl PARAM_SRC2, %edi
+
+ xorl %eax, %eax C total
+ pushl %ebx FRAME_pushl()
+
+ xorl %edx, %edx C byte
+ xorl %ebx, %ebx C byte
+
+define(TABLE,`TABLE_NAME($1)')
+')
+
+
+ C The nop after the xorb seems necessary. Although a movb might be
+ C expected to go down the V pipe in the second cycle of the xorb, it
+ C doesn't and costs an extra 2 cycles.
+L(top):
+ C eax total
+ C ebx byte
+ C ecx counter, 2*size to 2
+ C edx byte
+ C esi src1
+ C edi src2
+ C ebp [PIC] table
+
+ addl %ebx, %eax
+ movb -1(%esi,%ecx,2), %bl
+
+ addl %edx, %eax
+ movb -1(%edi,%ecx,2), %dl
+
+ xorb %dl, %bl
+ movb -2(%esi,%ecx,2), %dl
+
+ xorb -2(%edi,%ecx,2), %dl
+ nop
+
+ movb TABLE(%ebx), %bl
+ decl %ecx
+
+ movb TABLE(%edx), %dl
+ jnz L(top)
+
+
+ifdef(`PIC',`
+ popl %ebp
+')
+ addl %ebx, %eax
+ popl %ebx
+
+ addl %edx, %eax
+ popl %edi
+
+ popl %esi
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm
new file mode 100644
index 0000000..1877317
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/logops_n.asm
@@ -0,0 +1,176 @@
+dnl Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 3.0 c/l and, ior, xor
+C 3.5 c/l andn, iorn, nand, nior, xnor
+
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4op', `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n, , andl, )
+M4_choose_op( andn_n, , andl, yes)
+M4_choose_op( nand_n, yes, andl, )
+M4_choose_op( ior_n, , orl, )
+M4_choose_op( iorn_n, , orl, yes)
+M4_choose_op( nior_n, yes, orl, )
+M4_choose_op( xor_n, , xorl, )
+M4_choose_op( xnor_n, yes, xorl, )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+NAILS_SUPPORT(0-31)
+
+
+C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C Nothing complicated here, just some care to avoid data cache bank clashes
+C and AGIs.
+C
+C We're one register short of being able to do a simple 4 loads, 2 ops, 2
+C stores. Instead %ebp is juggled a bit and nops are introduced to keep the
+C pairings as intended. An in-place operation would free up a register, for
+C a 0.5 c/l speedup, if that's worth bothering with.
+C
+C This code seems best for P55 too. Data alignment is a big problem for MMX
+C and the pairing restrictions on movq and integer instructions make life
+C difficult.
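+C
+C For example mpn_and_n (wp, xp, yp, size) stores xp[i] & yp[i] at wp[i],
+C while a post-complemented variant such as mpn_nand_n stores
+C (~(xp[i] & yp[i])) & GMP_NUMB_MASK.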
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_YP, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ pushl %edi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_XP, %ebx
+
+ movl PARAM_YP, %esi
+ movl PARAM_WP, %edi
+
+ shrl %ecx
+ jnc L(entry)
+
+ movl (%ebx,%ecx,8), %eax C risk of data cache bank clash here
+ movl (%esi,%ecx,8), %edx
+
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(%edx)')
+
+ M4op %edx, %eax
+
+M4post(`xorl $GMP_NUMB_MASK, %eax')
+ orl %ecx, %ecx
+
+ movl %eax, (%edi,%ecx,8)
+ jz L(done)
+
+ jmp L(entry)
+
+
+L(top):
+ C eax
+ C ebx xp
+ C ecx counter, limb pairs, decrementing
+ C edx
+ C esi yp
+ C edi wp
+ C ebp
+
+ M4op %ebp, %edx
+ nop
+
+M4post(`xorl $GMP_NUMB_MASK, %eax')
+M4post(`xorl $GMP_NUMB_MASK, %edx')
+
+ movl %eax, 4(%edi,%ecx,8)
+ movl %edx, (%edi,%ecx,8)
+
+L(entry):
+ movl -4(%ebx,%ecx,8), %ebp
+ nop
+
+ movl -4(%esi,%ecx,8), %eax
+ movl -8(%esi,%ecx,8), %edx
+
+M4pre(` xorl $GMP_NUMB_MASK, %eax')
+M4pre(` xorl $GMP_NUMB_MASK, %edx')
+
+ M4op %ebp, %eax
+ movl -8(%ebx,%ecx,8), %ebp
+
+ decl %ecx
+ jnz L(top)
+
+
+ M4op %ebp, %edx
+ nop
+
+M4post(`xorl $GMP_NUMB_MASK, %eax')
+M4post(`xorl $GMP_NUMB_MASK, %edx')
+
+ movl %eax, 4(%edi,%ecx,8)
+ movl %edx, (%edi,%ecx,8)
+
+
+L(done):
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm
new file mode 100644
index 0000000..2a31f36
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/lshift.asm
@@ -0,0 +1,243 @@
+dnl Intel Pentium mpn_lshift -- mpn left shift.
+
+dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5,P54: 6.0
+C P55: 5.375
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, ie. 5 cycles lost somewhere in each
+C 8 limb unrolled loop (48 cycles against the expected 43).
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_lshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%esi),%eax
+ cmpl %edi,%eax
+ jnc L(special) C jump if s_ptr + 1 >= res_ptr
+ leal (%esi,%ebp,4),%eax
+ cmpl %eax,%edi
+ jnc L(special) C jump if res_ptr >= s_ptr + size
+
+L(normal):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+ xorl %eax,%eax
+ shldl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ shldl( %cl, %ebx, %edx)
+ shldl( %cl, %eax, %ebx)
+ movl %edx,-8(%edi)
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ shldl( %cl, %edx, %eax)
+ shldl( %cl, %ebx, %edx)
+ movl %eax,-16(%edi)
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ shldl( %cl, %eax, %ebx)
+ shldl( %cl, %edx, %eax)
+ movl %ebx,-24(%edi)
+ movl %eax,-28(%edi)
+
+ subl $32,%esi
+ subl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shldl( %cl,%eax,%edx)
+ movl %edx,(%edi)
+ movl %eax,%edx
+ subl $4,%esi
+ subl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shll %cl,%edx C compute least significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C We loop from the least significant end of the arrays, which is only
+C permissible if the source and destination don't overlap, since the
+C function is documented to work for overlapping source and destination.
+
+L(special):
+ movl (%esi),%edx
+ addl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ addl %edx,%edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,(%edi)
+ adcl %edx,%edx
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ adcl %ebx,%ebx
+ movl %edx,8(%edi)
+ adcl %eax,%eax
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ adcl %edx,%edx
+ movl %eax,16(%edi)
+ adcl %ebx,%ebx
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ adcl %eax,%eax
+ movl %ebx,24(%edi)
+ adcl %edx,%edx
+ movl %eax,28(%edi)
+
+ leal 32(%esi),%esi C use leal not to clobber carry
+ leal 32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ adcl %edx,%edx
+ movl %ebx,(%edi)
+
+ leal 4(%esi),%esi C use leal not to clobber carry
+ leal 4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ sbbl %eax,%eax
+ negl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h
new file mode 100644
index 0000000..02a0def
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/gmp-mparam.h
@@ -0,0 +1,163 @@
+/* Intel P55 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2002, 2004, 2009, 2010 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* For mpn/x86/pentium/mod_1.asm */
+#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
+
+
+/* 233MHz P55 */
+
+#define MOD_1_NORM_THRESHOLD 5
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 12
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 11
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 63
+#define USE_PREINV_DIVREM_1 0
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 51
+
+#define MUL_TOOM22_THRESHOLD 16
+#define MUL_TOOM33_THRESHOLD 53
+#define MUL_TOOM44_THRESHOLD 128
+#define MUL_TOOM6H_THRESHOLD 189
+#define MUL_TOOM8H_THRESHOLD 260
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 89
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 91
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 90
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 88
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 20
+#define SQR_TOOM3_THRESHOLD 73
+#define SQR_TOOM4_THRESHOLD 178
+#define SQR_TOOM6_THRESHOLD 210
+#define SQR_TOOM8_THRESHOLD 375
+
+#define MULMOD_BNM1_THRESHOLD 11
+#define SQRMOD_BNM1_THRESHOLD 12
+
+#define MUL_FFT_MODF_THRESHOLD 364 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 364, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 17, 7}, { 9, 6}, \
+ { 21, 7}, { 11, 6}, { 23, 7}, { 15, 6}, \
+ { 31, 7}, { 21, 8}, { 11, 7}, { 27, 8}, \
+ { 15, 7}, { 33, 8}, { 19, 7}, { 39, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \
+ { 47,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159, 8}, { 319, 9}, { 167,10}, \
+ { 95, 9}, { 191, 8}, { 383,11}, { 63,10}, \
+ { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \
+ { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \
+ { 383,12}, { 63,11}, { 127,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 351,11}, \
+ { 191,10}, { 415,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 511,11}, { 287,10}, { 575,11}, \
+ { 351,12}, { 191,11}, { 415,13}, { 127,12}, \
+ { 255,11}, { 575,12}, { 319,11}, { 703,12}, \
+ { 383,11}, { 831,12}, { 447,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 90
+#define MUL_FFT_THRESHOLD 3520
+
+#define SQR_FFT_MODF_THRESHOLD 340 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 340, 5}, { 17, 6}, { 9, 5}, { 19, 6}, \
+ { 17, 7}, { 9, 6}, { 21, 7}, { 11, 6}, \
+ { 23, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 29, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 27, 7}, { 55, 9}, \
+ { 15, 8}, { 31, 7}, { 65, 8}, { 43, 9}, \
+ { 23, 8}, { 47,10}, { 15, 9}, { 31, 8}, \
+ { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
+ { 95,10}, { 31, 9}, { 63, 8}, { 127, 9}, \
+ { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \
+ { 63, 9}, { 127, 8}, { 255, 9}, { 135,10}, \
+ { 79, 9}, { 159, 8}, { 319,10}, { 95, 9}, \
+ { 191,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \
+ { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \
+ { 63,11}, { 127,10}, { 271, 9}, { 543,10}, \
+ { 287, 9}, { 575,10}, { 303,11}, { 159,10}, \
+ { 351,11}, { 191,10}, { 415,11}, { 223,10}, \
+ { 447,12}, { 127,11}, { 255,10}, { 543,11}, \
+ { 287,10}, { 607,11}, { 351,12}, { 191,11}, \
+ { 479,13}, { 127,12}, { 255,11}, { 575,12}, \
+ { 319,11}, { 703,12}, { 383,11}, { 767,12}, \
+ { 447,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 96
+#define SQR_FFT_THRESHOLD 5504
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 48
+#define MULLO_MUL_N_THRESHOLD 6633
+
+#define DC_DIV_QR_THRESHOLD 43
+#define DC_DIVAPPR_Q_THRESHOLD 170
+#define DC_BDIV_QR_THRESHOLD 43
+#define DC_BDIV_Q_THRESHOLD 110
+
+#define INV_MULMOD_BNM1_THRESHOLD 30
+#define INV_NEWTON_THRESHOLD 177
+#define INV_APPR_THRESHOLD 171
+
+#define BINV_NEWTON_THRESHOLD 194
+#define REDC_1_TO_REDC_N_THRESHOLD 50
+
+#define MU_DIV_QR_THRESHOLD 1142
+#define MU_DIVAPPR_Q_THRESHOLD 1142
+#define MUPI_DIV_QR_THRESHOLD 90
+#define MU_BDIV_QR_THRESHOLD 942
+#define MU_BDIV_Q_THRESHOLD 1017
+
+#define MATRIX22_STRASSEN_THRESHOLD 13
+#define HGCD_THRESHOLD 92
+#define GCD_DC_THRESHOLD 283
+#define GCDEXT_DC_THRESHOLD 221
+#define JACOBI_BASE_METHOD 2
+
+#define GET_STR_DC_THRESHOLD 18
+#define GET_STR_PRECOMPUTE_THRESHOLD 31
+#define SET_STR_DC_THRESHOLD 490
+#define SET_STR_PRECOMPUTE_THRESHOLD 994
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm
new file mode 100644
index 0000000..72e3196
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/hamdist.asm
@@ -0,0 +1,40 @@
+dnl Intel P55 mpn_hamdist -- mpn hamming distance.
+
+dnl Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P55: hamdist 12.0 cycles/limb
+
+C For reference, this code runs at 11.5 cycles/limb for popcount, which is
+C slower than the plain integer mpn/x86/pentium/popcount.asm.
+
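+C For reference, the operation computed is the bitwise difference count (a
+C hedged C sketch; popcount32() is an illustrative stand-in for the table
+C lookups done by the shared popham code):
+C
+C	unsigned long hamdist (const mp_limb_t *xp, const mp_limb_t *yp,
+C	                       long n)
+C	{
+C	  unsigned long count = 0;
+C	  for (long i = 0; i < n; i++)
+C	    count += popcount32 (xp[i] ^ yp[i]);	/* differing bits */
+C	  return count;
+C	}
+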
+MULFUNC_PROLOGUE(mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm
new file mode 100644
index 0000000..04b0ddc
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/lshift.asm
@@ -0,0 +1,463 @@
+dnl Intel P5 mpn_lshift -- mpn left shift.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size left by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the right. Return the bits shifted out at the
+C left.
+C
+C The comments in mpn_rshift apply here too.
+
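+C For reference, the operation performed is (a hedged C sketch assuming
+C 32-bit limbs and 1 <= shift <= 31, looping from the high end; not the
+C unrolled MMX code below):
+C
+C	mp_limb_t lshift_ref (mp_limb_t *dst, const mp_limb_t *src,
+C	                      long n, unsigned shift)
+C	{
+C	  mp_limb_t retval = src[n-1] >> (32 - shift);
+C	  for (long i = n-1; i > 0; i--)
+C	    dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
+C	  dst[0] = src[0] << shift;
+C	  return retval;
+C	}
+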
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl  Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_lshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ movl -4(%ebx,%eax,4), %edi C src high limb
+ decl %eax
+
+ jnz L(simple)
+
+ shldl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shll %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx,%eax,4), %mm5 C src high limb
+
+ movd %ecx, %mm6 C lshift
+ negl %ecx
+
+ psllq %mm6, %mm5
+ addl $32, %ecx
+
+ movd %ecx, %mm7
+ psrlq $32, %mm5 C retval
+
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0
+ decl %eax
+
+ psrlq %mm7, %mm0
+
+ C
+
+ movd %mm0, 4(%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+
+ movd %mm5, %eax
+ psllq %mm6, %mm0
+
+ popl %edi
+ popl %ebx
+
+ movd %mm0, (%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd -4(%ebx,%eax,4), %mm5 C src high limb
+ leal (%ebx,%eax,4), %edi
+
+ movd %ecx, %mm6 C lshift
+ andl $4, %edi
+
+ psllq %mm6, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process high limb separately (marked xxx) to
+ C make it so.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | xxx | |
+ C +-------+-------+--
+
+ movq -8(%ebx,%eax,4), %mm0 C unaligned load
+
+ psllq %mm6, %mm0
+ decl %eax
+
+ psrlq $32, %mm0
+
+ C
+
+ movd %mm0, (%edx,%eax,4)
+L(start_src_aligned):
+
+ movq -8(%ebx,%eax,4), %mm1 C src high qword
+ leal (%edx,%eax,4), %edi
+
+ andl $4, %edi
+ psrlq $32, %mm5 C return value
+
+ movq -16(%ebx,%eax,4), %mm3 C src second highest qword
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, subtract 4 to make it so, and pretend the shift
+ C is 32 bits extra. High limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source -8(ebx,%eax,4)
+ C |
+ C +-------+-------+--
+ C | mm1 |
+ C +-------+-------+--
+ C 0mod8 4mod8
+ C
+ C dest
+ C -4(edx,%eax,4)
+ C |
+ C +-------+-------+-------+--
+ C | xxx | |
+ C +-------+-------+-------+--
+ C 0mod8 4mod8 0mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psllq %mm6, %mm0
+
+ movd %ecx, %mm6
+ psrlq $32, %mm0
+
+ C wasted cycle here waiting for %mm0
+
+ movd %mm0, -4(%edx,%eax,4)
+ subl $4, %edx
+L(start_dst_aligned):
+
+
+ psllq %mm6, %mm1
+ negl %ecx C -shift
+
+ addl $64, %ecx C 64-shift
+ movq %mm3, %mm2
+
+ movd %ecx, %mm7
+ subl $8, %eax C size-8
+
+ psrlq %mm7, %mm3
+
+ por %mm1, %mm3 C mm3 ready to store
+ jc L(finish)
+
+
+ C The comments in mpn_rshift apply here too.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs
+ C ebx src
+ C ecx
+ C edx dst
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from 16(%ebx,%eax,4)
+ C mm3 dst qword ready to store to 24(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 lshift
+ C mm7 rshift
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq (%ebx,%eax,4), %mm3 C
+ psllq %mm6, %mm1 C
+
+ movq %mm0, 16(%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psrlq %mm7, %mm3 C
+ subl $4, %eax
+
+ por %mm1, %mm3 C
+ jnc L(unroll_loop)
+
+
+
+L(finish):
+ C eax -4 to -1 representing respectively 0 to 3 limbs remaining
+
+ testb $2, %al
+
+ jz L(finish_no_two)
+
+ movq 8(%ebx,%eax,4), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ movq %mm3, 24(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ subl $2, %eax
+L(finish_no_two):
+
+
+ C eax -4 or -3 representing respectively 0 or 1 limbs remaining
+ C
+ C mm2 src prev qword, from 16(%ebx,%eax,4)
+ C mm3 dst qword, for 24(%edx,%eax,4)
+
+ testb $1, %al
+ movd %mm5, %eax C retval
+
+ popl %edi
+ jz L(finish_zero)
+
+
+ C One extra src limb, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4 edx
+ C --+---------------+---------------+-------+
+ C | mm3 | | |
+ C --+---------------+---------------+-------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra src limb, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+-------+
+ C | mm2 | |
+ C --+---------------+-------+
+ C
+ C dest edx+12 edx+4
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 4(%edx), and in the aligned case
+ C there's an extra limb of dst to be formed from that extra src limb
+ C left shifted.
+
+
+ movd (%ebx), %mm0
+ psllq %mm6, %mm2
+
+ movq %mm3, 12(%edx)
+ psllq $32, %mm0
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm2, %mm0
+ psllq %mm6, %mm1
+
+ movq %mm0, 4(%edx)
+ psrlq $32, %mm1
+
+ andl $32, %ecx
+ popl %ebx
+
+ jz L(finish_one_unaligned)
+
+ movd %mm1, (%edx)
+L(finish_one_unaligned):
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra src limbs, destination was aligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx
+ C --+---------------+---------------+
+ C | mm3 | |
+ C --+---------------+---------------+
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra src limbs, destination was unaligned.
+ C
+ C source ebx
+ C --+---------------+
+ C | mm2 |
+ C --+---------------+
+ C
+ C dest edx+8 edx+4
+ C --+---------------+-------+
+ C | mm3 | |
+ C --+---------------+-------+
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C The movd for the unaligned case writes the same data to 4(%edx)
+ C that the movq does for the aligned case.
+
+
+ movq %mm3, 8(%edx)
+ andl $32, %ecx
+
+ psllq %mm6, %mm2
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, (%edx)
+L(finish_zero_unaligned):
+
+ psrlq $32, %mm2
+ popl %ebx
+
+ movd %mm5, %eax C retval
+
+ movd %mm2, 4(%edx)
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
new file mode 100644
index 0000000..4ced577
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/mul_1.asm
@@ -0,0 +1,371 @@
+dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5: 12.0 for 32-bit multiplier
+C 7.0 for 16-bit multiplier
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C
+C When the multiplier is 16 bits some special-case MMX code is used.  Small
+C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
+C is odd there's roughly a 5 cycle penalty, so times for, say, size==7 and
+C size==8 end up being quite close.  If src isn't aligned to an 8 byte
+C boundary then one limb is processed separately with roughly a 5 cycle
+C penalty, so in that case it's size==8 and size==9 that end up close.
+C
+C Alternatives:
+C
+C MMX is not believed to be of any use for 32-bit multipliers, since for
+C instance the current method would just have to be more or less duplicated
+C for the high and low halves of the multiplier, and would probably
+C therefore run at about 14 cycles, which is slower than the plain integer
+C at 12.
+C
+C Adding the high and low MMX products using integer code seems best. An
+C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
+C any joy. Perhaps something could be done keeping the values signed and
+C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
+C perhaps not.
+C
+C Future:
+C
+C An mpn_mul_1c entrypoint would need a double carry out of the low result
+C limb in the 16-bit code, unless it could be assumed the carry fits in 16
+C bits, possibly as carry<multiplier, this being true of a big calculation
+C done piece by piece. But let's worry about that if/when mul_1c is
+C actually used.
+
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ cmpl $1, %ecx
+ jne L(two_or_more)
+
+ C one limb only
+
+ movl PARAM_MULTIPLIER, %eax
+ movl PARAM_DST, %ecx
+
+ mull (%edx)
+
+ movl %eax, (%ecx)
+ movl %edx, %eax
+
+ ret
+
+
+L(two_or_more):
+ C eax size
+ C ebx
+ C ecx carry
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ movl %edx, %esi C src
+ movl PARAM_DST, %edi
+
+ movl PARAM_MULTIPLIER, %eax
+ pushl %ebx FRAME_pushl()
+
+ leal (%esi,%ecx,4), %esi C src end
+ leal (%edi,%ecx,4), %edi C dst end
+
+ negl %ecx C -size
+
+ pushl %ebp FRAME_pushl()
+ cmpl $65536, %eax
+
+ jb L(small)
+
+
+L(big):
+ xorl %ebx, %ebx C carry limb
+ sarl %ecx C -size/2
+
+ jnc L(top) C with carry flag clear
+
+
+ C size was odd, process one limb separately
+
+ mull 4(%esi,%ecx,8) C m * src[0]
+
+ movl %eax, 4(%edi,%ecx,8)
+ incl %ecx
+
+ orl %edx, %ebx C carry limb, and clear carry flag
+
+
+L(top):
+ C eax
+ C ebx carry
+ C ecx counter, negative
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp (scratch carry)
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %edx, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %ebp
+ movl 4(%esi,%ecx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %ebx, (%edi,%ecx,8)
+ addl %ebp, %eax
+
+ movl %eax, 4(%edi,%ecx,8)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(top)
+
+
+ adcl $0, %ebx
+ popl %ebp
+
+ movl %ebx, %eax
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+
+L(small):
+ C Special case for 16-bit multiplier.
+ C
+ C eax multiplier
+ C ebx
+ C ecx -size
+ C edx src
+ C esi src end
+ C edi dst end
+ C ebp multiplier
+
+	C size<3 is not supported here.  At size==3 we're already a couple
+	C of cycles faster, so there's no threshold as such; the MMX code is
+	C used as soon as possible.
+
+ cmpl $-3, %ecx
+ ja L(big)
+
+ movd %eax, %mm7 C m
+ pxor %mm6, %mm6 C initial carry word
+
+ punpcklwd %mm7, %mm7 C m replicated 2 times
+ addl $2, %ecx C -size+2
+
+ punpckldq %mm7, %mm7 C m replicated 4 times
+ andl $4, %edx C test alignment, clear carry flag
+
+ movq %mm7, %mm0 C m
+ jz L(small_entry)
+
+
+ C Source is unaligned, process one limb separately.
+ C
+ C Plain integer code is used here, since it's smaller and is about
+ C the same 13 cycles as an mmx block would be.
+ C
+ C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
+ C the use of separate incl and orl.
+
+ mull -8(%esi,%ecx,4) C m * src[0]
+
+ movl %eax, -8(%edi,%ecx,4) C dst[0]
+ incl %ecx C one limb processed
+
+ movd %edx, %mm6 C initial carry
+
+ orl %eax, %eax C clear carry flag
+ jmp L(small_entry)
+
+
+C The scheduling here is quite tricky, since so many instructions have
+C pairing restrictions. In particular the js won't pair with a movd, and
+C can't be paired with an adc since it wants flags from the inc, so
+C instructions are rotated to the top of the loop to find somewhere useful
+C for it.
+C
+C Trouble has been taken to avoid overlapping successive loop iterations,
+C since that would greatly increase the size of the startup and finishup
+C code. Actually there's probably not much advantage to be had from
+C overlapping anyway, since the difficulties are mostly with pairing, not
+C with latencies as such.
+C
+C In the comments x represents the src data and m the multiplier (16
+C bits, but replicated 4 times).
+C
+C The m signs calculated in %mm3 are a loop invariant and could be held in
+C say %mm5, but that would save only one instruction and hence be no faster.
+
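+C Expressed in C, the unsigned 16x16->32 bit product is recovered from the
+C signed pmullw/pmulhw results by the following identity (a sketch only,
+C with x and m as uint16_t operands; not code from this file):
+C
+C	uint16_t lo  = (uint16_t) ((uint32_t) x * m);		/* pmullw */
+C	uint16_t shi = (uint16_t) (((int32_t) (int16_t) x
+C	                            * (int16_t) m) >> 16);	/* pmulhw */
+C	uint16_t hi  = shi + ((m >> 15) ? x : 0)   /* add x to h if m neg */
+C	                   + ((x >> 15) ? m : 0);  /* add m to h if x neg */
+C	/* now ((uint32_t) hi << 16 | lo) == (uint32_t) x * m */
+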
+L(small_top):
+ C eax l.low, then l.high
+ C ebx (h.low)
+ C ecx counter, -size+2 to 0 or 1
+ C edx (h.high)
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+ C
+ C %mm0 (high products)
+ C %mm1 (low products)
+ C %mm2 (adjust for m using x signs)
+ C %mm3 (adjust for x using m signs)
+ C %mm4
+ C %mm5
+ C %mm6 h.low, then carry
+ C %mm7 m replicated 4 times
+
+ movd %mm6, %ebx C h.low
+ psrlq $32, %mm1 C l.high
+
+ movd %mm0, %edx C h.high
+ movq %mm0, %mm6 C new c
+
+ adcl %eax, %ebx
+ incl %ecx
+
+ movd %mm1, %eax C l.high
+ movq %mm7, %mm0
+
+ adcl %eax, %edx
+ movl %ebx, -16(%edi,%ecx,4)
+
+ movl %edx, -12(%edi,%ecx,4)
+ psrlq $32, %mm6 C c
+
+L(small_entry):
+ pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high
+ movq %mm7, %mm1
+
+ pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low
+ movq %mm7, %mm3
+
+ movq -8(%esi,%ecx,4), %mm2 C x
+ psraw $15, %mm3 C m signs
+
+ pand -8(%esi,%ecx,4), %mm3 C x selected by m signs
+ psraw $15, %mm2 C x signs
+
+ paddw %mm3, %mm0 C add x to h if m neg
+ pand %mm7, %mm2 C m selected by x signs
+
+ paddw %mm2, %mm0 C add m to h if x neg
+ incl %ecx
+
+ movd %mm1, %eax C l.low
+ punpcklwd %mm0, %mm6 C c + h.low << 16
+
+ psrlq $16, %mm0 C h.high
+ js L(small_top)
+
+
+
+
+ movd %mm6, %ebx C h.low
+ psrlq $32, %mm1 C l.high
+
+ adcl %eax, %ebx
+ popl %ebp FRAME_popl()
+
+ movd %mm0, %edx C h.high
+ psrlq $32, %mm0 C l.high
+
+ movd %mm1, %eax C l.high
+
+ adcl %eax, %edx
+ movl %ebx, -12(%edi,%ecx,4)
+
+ movd %mm0, %eax C c
+
+ adcl $0, %eax
+ movl %edx, -8(%edi,%ecx,4)
+
+ orl %ecx, %ecx
+ jnz L(small_done) C final %ecx==1 means even, ==0 odd
+
+
+ C Size odd, one extra limb to process.
+ C Plain integer code is used here, since it's smaller and is about
+ C the same speed as another mmx block would be.
+
+ movl %eax, %ecx
+ movl PARAM_MULTIPLIER, %eax
+
+ mull -4(%esi)
+
+ addl %ecx, %eax
+
+ adcl $0, %edx
+ movl %eax, -4(%edi)
+
+ movl %edx, %eax
+L(small_done):
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm
new file mode 100644
index 0000000..e3b274b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,468 @@
+dnl Intel P5 mpn_rshift -- mpn right shift.
+
+dnl Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left. Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned. Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
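+C The core of the unrolled loop below is a 64-bit combine: each destination
+C qword is formed from two consecutive source qwords.  In C (a reference
+C identity, assuming 0 < s < 64; "q" is the current qword, "q_next" the
+C next higher one):
+C
+C	dst_qword = (q >> s) | (q_next << (64 - s));	/* psrlq/psllq/por */
+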
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+ pushl %ebx
+ pushl %edi
+deflit(`FRAME',8)
+
+ movl PARAM_SIZE, %eax
+ movl PARAM_DST, %edx
+
+ movl PARAM_SRC, %ebx
+ movl PARAM_SHIFT, %ecx
+
+ cmp $UNROLL_THRESHOLD, %eax
+ jae L(unroll)
+
+ decl %eax
+ movl (%ebx), %edi C src low limb
+
+ jnz L(simple)
+
+ shrdl( %cl, %edi, %eax) C eax was decremented to zero
+
+ shrl %cl, %edi
+
+ movl %edi, (%edx) C dst low limb
+ popl %edi C risk of data cache bank clash
+
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(simple):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ leal (%ebx,%eax,4), %ebx C &src[size-1]
+
+ movd %ecx, %mm6 C rshift
+ leal -4(%edx,%eax,4), %edx C &dst[size-2]
+
+ psllq $32, %mm5
+ negl %eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop. Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+ C eax counter, limbs, negative
+ C ebx &src[size-1]
+ C ecx return value
+ C edx &dst[size-2]
+ C
+ C mm0 scratch
+ C mm5 return value
+ C mm6 shift
+
+ movq (%ebx,%eax,4), %mm0
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+ movd %mm0, (%edx,%eax,4)
+ jnz L(simple_top)
+
+
+ movd (%ebx), %mm0
+ psrlq %mm6, %mm5 C return value
+
+ psrlq %mm6, %mm0
+ popl %edi
+
+ movd %mm5, %eax
+ popl %ebx
+
+ movd %mm0, 4(%edx)
+
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(unroll):
+ C eax size
+ C ebx src
+ C ecx shift
+ C edx dst
+ C esi
+ C edi
+ C ebp
+deflit(`FRAME',8)
+
+ movd (%ebx), %mm5 C src[0]
+ movl $4, %edi
+
+ movd %ecx, %mm6 C rshift
+ testl %edi, %ebx
+
+ psllq $32, %mm5
+ jz L(start_src_aligned)
+
+
+ C src isn't aligned, process low limb separately (marked xxx) and
+ C step src and dst by one limb, making src aligned.
+ C
+ C source ebx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+ C
+ C dest edx
+ C --+-------+-------+
+ C | | xxx |
+ C --+-------+-------+
+
+ movq (%ebx), %mm0 C unaligned load
+
+ psrlq %mm6, %mm0
+ addl $4, %ebx
+
+ decl %eax
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_src_aligned):
+
+
+ movq (%ebx), %mm1
+ testl %edi, %edx
+
+ psrlq %mm6, %mm5 C retval
+ jz L(start_dst_aligned)
+
+ C dst isn't aligned, add 4 to make it so, and pretend the shift is
+ C 32 bits extra. Low limb of dst (marked xxx) handled here
+ C separately.
+ C
+ C source ebx
+ C --+-------+-------+
+ C | mm1 |
+ C --+-------+-------+
+ C 4mod8 0mod8
+ C
+ C dest edx
+ C --+-------+-------+-------+
+ C | xxx |
+ C --+-------+-------+-------+
+ C 4mod8 0mod8 4mod8
+
+ movq %mm1, %mm0
+ addl $32, %ecx C new shift
+
+ psrlq %mm6, %mm0
+
+ movd %ecx, %mm6
+
+ movd %mm0, (%edx)
+ addl $4, %edx
+L(start_dst_aligned):
+
+
+ movq 8(%ebx), %mm3
+ negl %ecx
+
+ movq %mm3, %mm2 C mm2 src qword
+ addl $64, %ecx
+
+ movd %ecx, %mm7
+ psrlq %mm6, %mm1
+
+ leal -12(%ebx,%eax,4), %ebx
+ leal -20(%edx,%eax,4), %edx
+
+ psllq %mm7, %mm3
+ subl $7, %eax C size-7
+
+ por %mm1, %mm3 C mm3 ready to store
+ negl %eax C -(size-7)
+
+ jns L(finish)
+
+
+	C This loop is the important bit; the rest is just support.  Careful
+ C instruction scheduling achieves the claimed 1.75 c/l. The
+ C relevant parts of the pairing rules are:
+ C
+ C - mmx loads and stores execute only in the U pipe
+ C - only one mmx shift in a pair
+ C - wait one cycle before storing an mmx register result
+ C - the usual address generation interlock
+ C
+ C Two qword calculations are slightly interleaved. The instructions
+ C marked "C" belong to the second qword, and the "C prev" one is for
+ C the second qword from the previous iteration.
+
+ ALIGN(8)
+L(unroll_loop):
+ C eax counter, limbs, negative
+ C ebx &src[size-12]
+ C ecx
+ C edx &dst[size-12]
+ C esi
+ C edi
+ C
+ C mm0
+ C mm1
+ C mm2 src qword from -8(%ebx,%eax,4)
+ C mm3 dst qword ready to store to -8(%edx,%eax,4)
+ C
+ C mm5 return value
+ C mm6 rshift
+ C mm7 lshift
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq 8(%ebx,%eax,4), %mm3 C
+ psrlq %mm6, %mm1 C
+
+ movq %mm0, (%edx,%eax,4)
+ movq %mm3, %mm2 C
+
+ psllq %mm7, %mm3 C
+ addl $4, %eax
+
+ por %mm1, %mm3 C
+ js L(unroll_loop)
+
+
+L(finish):
+ C eax 0 to 3 representing respectively 3 to 0 limbs remaining
+
+ testb $2, %al
+
+ jnz L(finish_no_two)
+
+ movq (%ebx,%eax,4), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, -8(%edx,%eax,4) C prev
+ por %mm2, %mm0
+
+ movq %mm1, %mm2
+ movq %mm0, %mm3
+
+ addl $2, %eax
+L(finish_no_two):
+
+
+ C eax 2 or 3 representing respectively 1 or 0 limbs remaining
+ C
+ C mm2 src prev qword, from -8(%ebx,%eax,4)
+ C mm3 dst qword, for -8(%edx,%eax,4)
+
+ testb $1, %al
+ popl %edi
+
+ movd %mm5, %eax C retval
+ jnz L(finish_zero)
+
+
+ C One extra limb, destination was aligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +-------+---------------+---------------+--
+ C | | | mm3 |
+ C +-------+---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C One extra limb, destination was unaligned.
+ C
+ C source ebx
+ C +-------+---------------+--
+ C | | mm2 |
+ C +-------+---------------+--
+ C
+ C dest edx
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = ecx = 64-(shift+32)
+
+
+ C In both cases there's one extra limb of src to fetch and combine
+ C with mm2 to make a qword at 8(%edx), and in the aligned case
+ C there's a further extra limb of dst to be formed.
+
+
+ movd 8(%ebx), %mm0
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ movq %mm3, (%edx)
+ por %mm2, %mm0
+
+ psrlq %mm6, %mm1
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_one_unaligned)
+
+ C dst was aligned, must store one extra limb
+ movd %mm1, 16(%edx)
+L(finish_one_unaligned):
+
+ movq %mm0, 8(%edx)
+
+ emms
+
+ ret
+
+
+L(finish_zero):
+
+ C No extra limbs, destination was aligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +---------------+---------------+--
+ C | | mm3 |
+ C +---------------+---------------+--
+ C
+ C mm6 = shift
+ C mm7 = ecx = 64-shift
+
+
+ C No extra limbs, destination was unaligned.
+ C
+ C source ebx
+ C +---------------+--
+ C | mm2 |
+ C +---------------+--
+ C
+ C dest edx+4
+ C +-------+---------------+--
+ C | | mm3 |
+ C +-------+---------------+--
+ C
+ C mm6 = shift+32
+ C mm7 = 64-(shift+32)
+
+
+	C The movd for the unaligned case is clearly the same data as the
+	C movq for the aligned case; it's just a choice of whether one or
+	C two limbs should be written.
+
+
+ movq %mm3, 4(%edx)
+ psrlq %mm6, %mm2
+
+ movd %mm2, 12(%edx)
+ andl $32, %ecx
+
+ popl %ebx
+ jz L(finish_zero_unaligned)
+
+ movq %mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+ emms
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm
new file mode 100644
index 0000000..2d88223
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mod_34lsub1.asm
@@ -0,0 +1,192 @@
+dnl Intel P5 mpn_mod_34lsub1 -- mpn remainder modulo 2**24-1.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.66 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+
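+C Since 2^24 == 1 modulo 2^24-1, the input can be folded into a sum of its
+C 24-bit digits; each 32-bit limb contributes with weight 2^0, 2^8 or 2^16
+C according to its position within a 96-bit group.  A hedged C sketch of
+C the idea (illustrative only; the assembler keeps three separate
+C accumulators and returns an unreduced congruent value rather than taking
+C the final remainder):
+C
+C	uint64_t acc = 0;
+C	for (long i = 0; i < n; i++)
+C	  acc += (uint64_t) src[i] << (8 * (i % 3));
+C	/* acc == {src,n} mod 2^24-1, as long as acc hasn't overflowed */
+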
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx
+ ja L(three_or_more)
+
+ movl (%edx), %eax
+ jne L(one)
+
+
+ movl 4(%edx), %ecx
+ movl %eax, %edx
+
+ shrl $24, %edx
+ andl $0xFFFFFF, %eax
+
+ addl %edx, %eax
+ movl %ecx, %edx
+
+ shrl $16, %ecx
+ andl $0xFFFF, %edx
+
+ shll $8, %edx
+ addl %ecx, %eax
+
+ addl %edx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+ C ebp
+
+ pushl %ebx FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ pushl %edi FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ xorl %esi, %esi C 0mod3
+ xorl %edi, %edi C 1mod3
+
+ xorl %ebp, %ebp C 2mod3, and clear carry
+
+L(top):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, limbs
+ C edx src
+ C esi 0mod3
+ C edi 1mod3
+ C ebp 2mod3
+
+ movl (%edx), %eax
+ movl 4(%edx), %ebx
+
+ adcl %eax, %esi
+ movl 8(%edx), %eax
+
+ adcl %ebx, %edi
+ leal 12(%edx), %edx
+
+ adcl %eax, %ebp
+ leal -2(%ecx), %ecx
+
+ decl %ecx
+ jg L(top)
+
+
+ C ecx is -2, -1 or 0, representing 0, 1 or 2 more limbs, respectively
+
+ movl $0xFFFFFFFF, %ebx C mask
+ incl %ecx
+
+ js L(combine) C 0 more
+
+ movl (%edx), %eax
+ movl $0xFFFFFF00, %ebx
+
+ adcl %eax, %esi
+ decl %ecx
+
+ js L(combine) C 1 more
+
+ movl 4(%edx), %eax
+ movl $0xFFFF0000, %ebx
+
+ adcl %eax, %edi
+
+
+
+L(combine):
+ C eax
+ C ebx mask
+ C ecx
+ C edx
+ C esi 0mod3
+ C edi 1mod3
+ C ebp 2mod3
+
+ sbbl %ecx, %ecx C carry
+ movl %esi, %eax C 0mod3
+
+ andl %ebx, %ecx C masked for position
+ andl $0xFFFFFF, %eax C 0mod3 low
+
+ shrl $24, %esi C 0mod3 high
+ subl %ecx, %eax C apply carry
+
+ addl %esi, %eax C apply 0mod3
+ movl %edi, %ebx C 1mod3
+
+ shrl $16, %edi C 1mod3 high
+ andl $0x0000FFFF, %ebx
+
+ shll $8, %ebx C 1mod3 low
+ addl %edi, %eax C apply 1mod3 high
+
+ addl %ebx, %eax C apply 1mod3 low
+ movl %ebp, %ebx C 2mod3
+
+ shrl $8, %ebp C 2mod3 high
+ andl $0xFF, %ebx
+
+ shll $16, %ebx C 2mod3 low
+ addl %ebp, %eax C apply 2mod3 high
+
+ addl %ebx, %eax C apply 2mod3 low
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm
new file mode 100644
index 0000000..a90abca
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mode1o.asm
@@ -0,0 +1,279 @@
+dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2002, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 23.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C There seems to be no way to pair up the two lone instructions in the
+C main loop.
+C
+C The special case for size==1 saves about 20 cycles (non-PIC), making it
+C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
+C all sizes.
+C
+C Alternatives:
+C
+C Using mmx for the multiplies might be possible, with pmullw and pmulhw
+C having just 3 cycle latencies, but carry bit handling would probably be
+C complicated.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-using parameter space
+define(VAR_INVERSE,`PARAM_SIZE')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ movl PARAM_CARRY, %edx
+
+ jmp L(start_1c)
+
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ xorl %edx, %edx C carry
+
+L(start_1c):
+
+ifdef(`PIC',`
+ifdef(`DARWIN',`
+ shrl %eax C d/2
+ LEA( binvert_limb_table, %ecx)
+ pushl %ebx FRAME_pushl()
+ movl PARAM_SIZE, %ebx
+
+ andl $127, %eax
+ subl $2, %ebx
+
+ movb (%eax,%ecx), %cl
+ jc L(one_limb)
+',`
+ call L(here) FRAME_pushl()
+L(here):
+
+ shrl %eax C d/2
+ movl (%esp), %ecx C eip
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
+ movl %ebx, (%esp) C push ebx
+
+ andl $127, %eax
+ movl PARAM_SIZE, %ebx
+
+ movl binvert_limb_table@GOT(%ecx), %ecx
+ subl $2, %ebx
+
+ movb (%eax,%ecx), %cl C inv 8 bits
+ jc L(one_limb)
+')
+',`
+dnl non-PIC
+ shrl %eax C d/2
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SIZE, %ebx
+ andl $127, %eax
+
+ subl $2, %ebx
+ jc L(one_limb)
+
+ movb binvert_limb_table(%eax), %cl C inv 8 bits
+')
+
+ movl %ecx, %eax
+ addl %ecx, %ecx C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ subl %eax, %ecx C inv = 2*inv - inv*inv*d
+
+ movl %ecx, %eax
+ addl %ecx, %ecx C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ subl %eax, %ecx C inv = 2*inv - inv*inv*d
+ pushl %esi FRAME_pushl()
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl %ecx, %eax
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax')
+
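+C Each of the two sequences above is one Newton step for the inverse
+C modulo a power of 2: given inv*d == 1 mod 2^k it yields inv*d == 1 mod
+C 2^(2k), lifting the 8-bit table value to 16 and then 32 bits.  In C the
+C step is simply (all arithmetic mod 2^32):
+C
+C	inv = 2*inv - inv*inv*d;
+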
+ movl PARAM_SRC, %esi
+ movl %ecx, VAR_INVERSE
+
+ movl (%esi), %eax C src[0]
+ leal 4(%esi,%ebx,4), %esi C &src[size-1]
+
+ xorl $-1, %ebx C -(size-1)
+ ASSERT(nz)
+ jmp L(entry)
+
+
+C The use of VAR_INVERSE means only a store is needed for that value, rather
+C than a push and pop of say %edi.
+
+ ALIGN(16)
+L(top):
+ C eax scratch, low product
+ C ebx counter, limbs, negative
+ C ecx carry bit
+ C edx scratch, high product
+ C esi &src[size-1]
+ C edi
+ C ebp
+
+ mull PARAM_DIVISOR C h:dummy = q*d
+
+ movl (%esi,%ebx,4), %eax C src[i]
+ subl %ecx, %edx C h -= -c
+
+L(entry):
+ subl %edx, %eax C s = src[i] - h
+
+ sbbl %ecx, %ecx C new -c (0 or -1)
+
+ imull VAR_INVERSE, %eax C q = s*i
+
+ incl %ebx
+ jnz L(top)
+
+
+ mull PARAM_DIVISOR
+
+ movl (%esi), %eax C src high
+ subl %ecx, %edx C h -= -c
+
+ cmpl PARAM_DIVISOR, %eax
+
+ jbe L(skip_last)
+deflit(FRAME_LAST,FRAME)
+
+
+ subl %edx, %eax C s = src[i] - h
+ popl %esi FRAME_popl()
+
+ sbbl %ecx, %ecx C c (0 or -1)
+ popl %ebx FRAME_popl()
+
+ imull VAR_INVERSE, %eax C q = s*i
+
+ mull PARAM_DIVISOR C h:dummy = q*d
+
+ movl %edx, %eax
+
+ subl %ecx, %eax
+
+ ret
+
+
+C When high<divisor can skip last step.
+
+L(skip_last):
+deflit(`FRAME',FRAME_LAST)
+ C eax src high
+ C ebx
+ C ecx
+ C edx r
+ C esi
+
+ subl %eax, %edx C r-s
+ popl %esi FRAME_popl()
+
+ sbbl %eax, %eax C -1 if underflow
+ movl PARAM_DIVISOR, %ebx
+
+ andl %ebx, %eax C divisor if underflow
+ popl %ebx FRAME_popl()
+
+ addl %edx, %eax C addback if underflow
+
+ ret
+
+
+C Special case for size==1 using a division for r = c-a mod d.
+C Could look for a-c<d and save a division sometimes, but that doesn't seem
+C worth bothering about.
+
+L(one_limb):
+deflit(`FRAME',4)
+ C eax
+ C ebx size-2 (==-1)
+ C ecx
+ C edx carry
+ C esi src end
+ C edi
+ C ebp
+
+ movl %edx, %eax
+ movl PARAM_SRC, %edx
+
+ movl PARAM_DIVISOR, %ecx
+ popl %ebx FRAME_popl()
+
+ subl (%edx), %eax C c-a
+
+ sbbl %edx, %edx
+ decl %ecx C d-1
+
+ andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a
+
+ divl PARAM_DIVISOR
+
+ movl %edx, %eax
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm
new file mode 100644
index 0000000..a0858af
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_1.asm
@@ -0,0 +1,177 @@
+dnl Intel Pentium mpn_mul_1 -- mpn by limb multiplication.
+
+dnl Copyright 1992, 1994, 1996, 1999, 2000, 2002 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 12.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier);
+C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t multiplier, mp_limb_t carry);
+C
+
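+C For reference, the operation performed is (a hedged C sketch using a
+C 64-bit temporary, 32-bit limbs assumed; mpn_mul_1c merely starts with
+C the given carry instead of 0):
+C
+C	mp_limb_t mul_1_ref (mp_limb_t *dst, const mp_limb_t *src,
+C	                     long n, mp_limb_t m)
+C	{
+C	  mp_limb_t carry = 0;
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t p = (uint64_t) src[i] * m + carry;
+C	      dst[i] = (mp_limb_t) p;		/* low limb stored */
+C	      carry = (mp_limb_t) (p >> 32);	/* high limb carried */
+C	    }
+C	  return carry;
+C	}
+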
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ pushl %esi FRAME_pushl()
+
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+ pushl %esi FRAME_pushl()
+
+L(start_1c):
+ movl PARAM_SRC, %esi
+ movl PARAM_SIZE, %eax
+
+ shrl %eax
+ jnz L(two_or_more)
+
+
+ C one limb only
+
+ movl (%esi), %eax
+
+ mull PARAM_MULTIPLIER
+
+ addl %eax, %ecx
+ movl PARAM_DST, %eax
+
+ adcl $0, %edx
+ popl %esi
+
+ movl %ecx, (%eax)
+ movl %edx, %eax
+
+ ret
+
+
+L(two_or_more):
+ C eax size/2
+ C ebx
+ C ecx carry
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %edi FRAME_pushl()
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_DST, %edi
+ leal -1(%eax), %ebx C size/2-1
+
+ notl %ebx C -size, preserve carry
+
+ leal (%esi,%eax,8), %esi C src end
+ leal (%edi,%eax,8), %edi C dst end
+
+ pushl %ebp FRAME_pushl()
+ jnc L(top)
+
+
+ C size was odd, process one limb separately
+
+ movl (%esi,%ebx,8), %eax
+ addl $4, %esi
+
+ mull PARAM_MULTIPLIER
+
+ addl %ecx, %eax
+ movl %edx, %ecx
+
+ movl %eax, (%edi,%ebx,8)
+ leal 4(%edi), %edi
+
+
+L(top):
+ C eax
+ C ebx counter, negative
+ C ecx carry
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp
+
+ adcl $0, %ecx
+ movl (%esi,%ebx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %edx, %ebp
+ addl %eax, %ecx
+
+ adcl $0, %ebp
+ movl 4(%esi,%ebx,8), %eax
+
+ mull PARAM_MULTIPLIER
+
+ movl %ecx, (%edi,%ebx,8)
+ addl %ebp, %eax
+
+ movl %eax, 4(%edi,%ebx,8)
+ incl %ebx
+
+ movl %edx, %ecx
+ jnz L(top)
+
+
+ adcl $0, %ecx
+ popl %ebp
+
+ movl %ecx, %eax
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm
new file mode 100644
index 0000000..4c7beb5
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_2.asm
@@ -0,0 +1,150 @@
+dnl Intel Pentium mpn_mul_2 -- mpn by 2-limb multiplication.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 24.0 cycles/limb
+
+
+C mp_limb_t mpn_mul_2 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_srcptr mult);
+C
+C At 24 c/l this is only 2 cycles faster than a separate mul_1 and addmul_1,
+C but has the advantage of making just one pass over the operands.
+C
+C There aren't enough registers to use PARAM_MULT directly, so the multiplier
+C limbs are transferred to local variables on the stack.
+
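+C For reference, the single pass computes (a hedged C sketch with 64-bit
+C temporaries, 32-bit limbs assumed; dst[0..n] is stored and the top limb
+C returned, as in the assembler below):
+C
+C	mp_limb_t mul_2_ref (mp_limb_t *dst, const mp_limb_t *src,
+C	                     long n, const mp_limb_t *m)
+C	{
+C	  mp_limb_t clo = 0, chi = 0;	/* carries into limbs i and i+1 */
+C	  for (long i = 0; i < n; i++)
+C	    {
+C	      uint64_t p0 = (uint64_t) src[i] * m[0] + clo;
+C	      uint64_t p1 = (uint64_t) src[i] * m[1] + chi + (p0 >> 32);
+C	      dst[i] = (mp_limb_t) p0;
+C	      clo = (mp_limb_t) p1;
+C	      chi = (mp_limb_t) (p1 >> 32);
+C	    }
+C	  dst[n] = clo;
+C	  return chi;
+C	}
+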
+defframe(PARAM_MULT, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_MULT_LOW, `PARAM_SRC')
+define(VAR_MULT_HIGH,`PARAM_DST')
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_2)
+deflit(`FRAME',0)
+
+ pushl %esi FRAME_pushl()
+ pushl %edi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ movl PARAM_MULT, %eax
+ movl PARAM_SIZE, %ecx
+
+ movl 4(%eax), %edx C mult high
+ movl (%eax), %eax C mult low
+
+ movl %eax, VAR_MULT_LOW
+ movl %edx, VAR_MULT_HIGH
+
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+
+ mull (%esi) C src[0] * mult[0]
+
+ movl %eax, %ebp C in case src==dst
+ movl (%esi), %eax C src[0]
+
+ movl %ebp, (%edi) C dst[0]
+ movl %edx, %ebx C initial low carry
+
+ xorl %ebp, %ebp C initial high carry
+ leal (%edi,%ecx,4), %edi C dst end
+
+ mull VAR_MULT_HIGH C src[0] * mult[1]
+
+ subl $2, %ecx C size-2
+ js L(done)
+
+ leal 8(%esi,%ecx,4), %esi C &src[size]
+ xorl $-1, %ecx C -(size-1)
+
+
+
+L(top):
+ C eax low prod
+ C ebx low carry
+ C ecx counter, negative
+ C edx high prod
+ C esi src end
+ C edi dst end
+ C ebp high carry (0 or -1)
+
+ andl $1, %ebp C 1 or 0
+ addl %eax, %ebx
+
+ adcl %edx, %ebp
+ ASSERT(nc)
+ movl (%esi,%ecx,4), %eax
+
+ mull VAR_MULT_LOW
+
+ addl %eax, %ebx C low carry
+ movl (%esi,%ecx,4), %eax
+
+ adcl %ebp, %edx C high carry
+ movl %ebx, (%edi,%ecx,4)
+
+ sbbl %ebp, %ebp C new high carry, -1 or 0
+ movl %edx, %ebx C new low carry
+
+ mull VAR_MULT_HIGH
+
+ incl %ecx
+ jnz L(top)
+
+
+L(done):
+ andl $1, %ebp C 1 or 0
+ addl %ebx, %eax
+
+ adcl %ebp, %edx
+ ASSERT(nc)
+ movl %eax, (%edi) C store carry low
+
+ movl %edx, %eax C return carry high
+
+ popl %ebp
+ popl %ebx
+
+ popl %edi
+ popl %esi
+
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm
new file mode 100644
index 0000000..e1d0f05
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/mul_basecase.asm
@@ -0,0 +1,142 @@
+dnl Intel Pentium mpn_mul_basecase -- mpn by mpn multiplication.
+
+dnl Copyright 1996, 1998-2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 14.2 cycles/crossproduct (approx)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+
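+C Schoolbook multiplication: the first inner loop computes wp = xp*yp[0],
+C and each further pass adds xp*yp[i] in at an offset of i limbs.  A
+C hedged C sketch of the structure (mul_1_ref and addmul_1_ref are
+C illustrative helpers with the usual mpn_mul_1/mpn_addmul_1 semantics):
+C
+C	void mul_basecase_ref (mp_limb_t *wp,
+C	                       const mp_limb_t *xp, long xn,
+C	                       const mp_limb_t *yp, long yn)
+C	{
+C	  wp[xn] = mul_1_ref (wp, xp, xn, yp[0]);	/* first row    */
+C	  for (long i = 1; i < yn; i++)			/* add the rest */
+C	    wp[xn + i] = addmul_1_ref (wp + i, xp, xn, yp[i]);
+C	}
+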
+defframe(PARAM_YSIZE, 20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+defframe(VAR_COUNTER, -4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_mul_basecase)
+
+ pushl %eax C dummy push for allocating stack slot
+ pushl %esi
+ pushl %ebp
+ pushl %edi
+deflit(`FRAME',16)
+
+ movl PARAM_XP,%esi
+ movl PARAM_WP,%edi
+ movl PARAM_YP,%ebp
+
+ movl (%esi),%eax C load xp[0]
+ mull (%ebp) C multiply by yp[0]
+ movl %eax,(%edi) C store to wp[0]
+ movl PARAM_XSIZE,%ecx C xsize
+ decl %ecx C If xsize = 1, ysize = 1 too
+ jz L(done)
+
+ movl PARAM_XSIZE,%eax
+ pushl %ebx
+FRAME_pushl()
+ movl %edx,%ebx
+ leal (%esi,%eax,4),%esi C make xp point at end
+ leal (%edi,%eax,4),%edi C offset wp by xsize
+ negl %ecx C negate j size/index for inner loop
+ xorl %eax,%eax C clear carry
+
+ ALIGN(8)
+L(oop1): adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax C load next limb at xp[j]
+ mull (%ebp)
+ addl %ebx,%eax
+ movl %eax,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop1)
+
+ adcl $0,%ebx
+ movl PARAM_YSIZE,%eax
+ movl %ebx,(%edi) C most significant limb of product
+ addl $4,%edi C increment wp
+ decl %eax
+ jz L(skip)
+ movl %eax,VAR_COUNTER C set index i to ysize
+
+L(outer):
+ addl $4,%ebp C make ebp point to next y limb
+ movl PARAM_XSIZE,%ecx
+ negl %ecx
+ xorl %ebx,%ebx
+
+ C code at 0x61 here, close enough to aligned
+L(oop2):
+ adcl $0,%ebx
+ movl (%esi,%ecx,4),%eax
+ mull (%ebp)
+ addl %ebx,%eax
+ movl (%edi,%ecx,4),%ebx
+ adcl $0,%edx
+ addl %eax,%ebx
+ movl %ebx,(%edi,%ecx,4)
+ incl %ecx
+ movl %edx,%ebx
+ jnz L(oop2)
+
+ adcl $0,%ebx
+
+ movl %ebx,(%edi)
+ addl $4,%edi
+ movl VAR_COUNTER,%eax
+ decl %eax
+ movl %eax,VAR_COUNTER
+ jnz L(outer)
+
+L(skip):
+ popl %ebx
+ popl %edi
+ popl %ebp
+ popl %esi
+ addl $4,%esp
+ ret
+
+L(done):
+ movl %edx,4(%edi) C store to wp[1]
+ popl %edi
+ popl %ebp
+ popl %esi
+ popl %eax C dummy pop for deallocating stack slot
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm
new file mode 100644
index 0000000..0e82144
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/popcount.asm
@@ -0,0 +1,146 @@
+dnl Intel P5 mpn_popcount -- mpn bit population count.
+
+dnl Copyright 2001, 2002, 2014, 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 8.0 cycles/limb
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C
+C An arithmetic approach has been found to be slower than the table lookup,
+C due to needing too many instructions.
+
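+C In C, the table method amounts to (a hedged sketch; TABLE stands for
+C the 256-entry byte popcount table generated below, and byte order is
+C irrelevant since only the total is wanted):
+C
+C	unsigned long popcount_ref (const mp_limb_t *src, long n)
+C	{
+C	  const unsigned char *b = (const unsigned char *) src;
+C	  unsigned long total = 0;
+C	  for (long i = 0; i < 4 * n; i++)
+C	    total += TABLE[b[i]];	/* one byte looked up at a time */
+C	  return total;
+C	}
+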
+C The slightly strange quoting here helps the renaming done by tune/many.pl.
+deflit(TABLE_NAME,
+m4_assert_defined(`GSYM_PREFIX')
+GSYM_PREFIX`'mpn_popcount``'_table')
+
+C FIXME: exporting the table to hamdist is incorrect as it hurts
+C incremental linking.
+
+ RODATA
+ ALIGN(8)
+ GLOBL TABLE_NAME
+TABLE_NAME:
+forloop(i,0,255,
+` .byte m4_popcount(i)
+')
+
+defframe(PARAM_SIZE,8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_popcount)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ pushl %esi FRAME_pushl()
+
+ifdef(`PIC',`
+ pushl %ebx FRAME_pushl()
+ pushl %ebp FRAME_pushl()
+ifdef(`DARWIN',`
+ shll %ecx C size in byte pairs
+ LEA( TABLE_NAME, %ebp)
+ movl PARAM_SRC, %esi
+ xorl %eax, %eax C total
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+',`
+ call L(here)
+L(here):
+ popl %ebp
+ shll %ecx C size in byte pairs
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
+ movl PARAM_SRC, %esi
+
+ xorl %eax, %eax C total
+ xorl %ebx, %ebx C byte
+
+ movl TABLE_NAME@GOT(%ebp), %ebp
+ xorl %edx, %edx C byte
+')
+define(TABLE,`(%ebp,$1)')
+',`
+dnl non-PIC
+ shll %ecx C size in byte pairs
+ movl PARAM_SRC, %esi
+
+ pushl %ebx FRAME_pushl()
+ xorl %eax, %eax C total
+
+ xorl %ebx, %ebx C byte
+ xorl %edx, %edx C byte
+
+define(TABLE,`TABLE_NAME`'($1)')
+')
+
+
+ ALIGN(8) C necessary on P55 for claimed speed
+L(top):
+ C eax total
+ C ebx byte
+ C ecx counter, 2*size to 2
+ C edx byte
+ C esi src
+ C edi
+ C ebp [PIC] table
+
+ addl %ebx, %eax
+ movb -1(%esi,%ecx,2), %bl
+
+ addl %edx, %eax
+ movb -2(%esi,%ecx,2), %dl
+
+ movb TABLE(%ebx), %bl
+ decl %ecx
+
+ movb TABLE(%edx), %dl
+ jnz L(top)
+
+
+ifdef(`PIC',`
+ popl %ebp
+')
+ addl %ebx, %eax
+ popl %ebx
+
+ addl %edx, %eax
+ popl %esi
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm
new file mode 100644
index 0000000..2105c4c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/rshift.asm
@@ -0,0 +1,243 @@
+dnl Intel Pentium mpn_rshift -- mpn right shift.
+
+dnl Copyright 1992, 1994-1996, 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5,P54: 6.0
+C P55: 5.375
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The main shift-by-N loop should run at 5.375 c/l and that's what P55 does,
+C but P5 and P54 run only at 6.0 c/l, which is 4 cycles lost somewhere.
+
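+C In C terms the function computes roughly the following (an illustrative
+C sketch, assuming 32-bit limbs, not the unrolled code used below):
+C
+C	mp_limb_t retval = src[0] << (32 - shift);  /* bits shifted out */
+C	mp_size_t i;
+C	for (i = 0; i < size - 1; i++)
+C	  dst[i] = (src[i] >> shift) | (src[i+1] << (32 - shift));
+C	dst[size - 1] = src[size - 1] >> shift;
+C	return retval;
+C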
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_rshift)
+
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ pushl %ebp
+deflit(`FRAME',16)
+
+ movl PARAM_DST,%edi
+ movl PARAM_SRC,%esi
+ movl PARAM_SIZE,%ebp
+ movl PARAM_SHIFT,%ecx
+
+C We can use faster code for shift-by-1 under certain conditions.
+ cmp $1,%ecx
+ jne L(normal)
+ leal 4(%edi),%eax
+ cmpl %esi,%eax
+ jnc L(special) C jump if res_ptr + 1 >= s_ptr
+ leal (%edi,%ebp,4),%eax
+ cmpl %eax,%esi
+ jnc L(special) C jump if s_ptr >= res_ptr + size
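+	C Both jumps go to the rcrl-based code at L(special), which runs
+	C from the most significant end and so is only safe for these
+	C overlap cases.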
+
+L(normal):
+ movl (%esi),%edx
+ addl $4,%esi
+ xorl %eax,%eax
+ shrdl( %cl, %edx, %eax) C compute carry limb
+ pushl %eax C push carry limb onto stack
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+ jz L(end)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(oop): movl 28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,(%edi)
+ movl %eax,4(%edi)
+
+ movl 8(%esi),%ebx
+ movl 12(%esi),%eax
+ shrdl( %cl, %ebx, %edx)
+ shrdl( %cl, %eax, %ebx)
+ movl %edx,8(%edi)
+ movl %ebx,12(%edi)
+
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ shrdl( %cl, %edx, %eax)
+ shrdl( %cl, %ebx, %edx)
+ movl %eax,16(%edi)
+ movl %edx,20(%edi)
+
+ movl 24(%esi),%eax
+ movl 28(%esi),%edx
+ shrdl( %cl, %eax, %ebx)
+ shrdl( %cl, %edx, %eax)
+ movl %ebx,24(%edi)
+ movl %eax,28(%edi)
+
+ addl $32,%esi
+ addl $32,%edi
+ decl %ebp
+ jnz L(oop)
+
+L(end): popl %ebp
+ andl $7,%ebp
+ jz L(end2)
+L(oop2):
+ movl (%esi),%eax
+ shrdl( %cl,%eax,%edx) C compute result limb
+ movl %edx,(%edi)
+ movl %eax,%edx
+ addl $4,%esi
+ addl $4,%edi
+ decl %ebp
+ jnz L(oop2)
+
+L(end2):
+ shrl %cl,%edx C compute most significant limb
+ movl %edx,(%edi) C store it
+
+ popl %eax C pop carry limb
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+
+C This code loops from the most significant end of the arrays, which is
+C only permissible for the overlap cases checked for above, since the
+C function is documented to work for overlapping source and destination.
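+C
+C A hedged C model of the rcrl loop (our naming, 32-bit limbs):
+C
+C	mp_limb_t carry = 0, lowbit;
+C	mp_size_t i;
+C	for (i = size - 1; i >= 0; i--)
+C	  {
+C	    lowbit = src[i] & 1;
+C	    dst[i] = (src[i] >> 1) | (carry << 31);
+C	    carry = lowbit;
+C	  }
+C	return carry << 31;	/* bit shifted out, in the high bit */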
+
+L(special):
+ leal -4(%edi,%ebp,4),%edi
+ leal -4(%esi,%ebp,4),%esi
+
+ movl (%esi),%edx
+ subl $4,%esi
+
+ decl %ebp
+ pushl %ebp
+ shrl $3,%ebp
+
+ shrl %edx
+ incl %ebp
+ decl %ebp
+ jz L(Lend)
+
+ movl (%edi),%eax C fetch destination cache line
+
+ ALIGN(4)
+L(Loop):
+ movl -28(%edi),%eax C fetch destination cache line
+ movl %edx,%ebx
+
+ movl (%esi),%eax
+ movl -4(%esi),%edx
+ rcrl %eax
+ movl %ebx,(%edi)
+ rcrl %edx
+ movl %eax,-4(%edi)
+
+ movl -8(%esi),%ebx
+ movl -12(%esi),%eax
+ rcrl %ebx
+ movl %edx,-8(%edi)
+ rcrl %eax
+ movl %ebx,-12(%edi)
+
+ movl -16(%esi),%edx
+ movl -20(%esi),%ebx
+ rcrl %edx
+ movl %eax,-16(%edi)
+ rcrl %ebx
+ movl %edx,-20(%edi)
+
+ movl -24(%esi),%eax
+ movl -28(%esi),%edx
+ rcrl %eax
+ movl %ebx,-24(%edi)
+ rcrl %edx
+ movl %eax,-28(%edi)
+
+ leal -32(%esi),%esi C use leal not to clobber carry
+ leal -32(%edi),%edi
+ decl %ebp
+ jnz L(Loop)
+
+L(Lend):
+ popl %ebp
+ sbbl %eax,%eax C save carry in %eax
+ andl $7,%ebp
+ jz L(Lend2)
+ addl %eax,%eax C restore carry from eax
+L(Loop2):
+ movl %edx,%ebx
+ movl (%esi),%edx
+ rcrl %edx
+ movl %ebx,(%edi)
+
+ leal -4(%esi),%esi C use leal not to clobber carry
+ leal -4(%edi),%edi
+ decl %ebp
+ jnz L(Loop2)
+
+ jmp L(L1)
+L(Lend2):
+ addl %eax,%eax C restore carry from eax
+L(L1): movl %edx,(%edi) C store last limb
+
+ movl $0,%eax
+ rcrl %eax
+
+ popl %ebp
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm
new file mode 100644
index 0000000..b11d767
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86/pentium/sqr_basecase.asm
@@ -0,0 +1,528 @@
+dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
+C product at around 20x20 limbs.
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Calculate src,size squared, storing the result in dst,2*size.
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the size is
+C small.
+
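+C In outline the scheme is as follows, shown as a hedged C model using
+C the real mpn entrypoints for illustration (not the literal code path):
+C
+C	dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
+C	for (n = 1; n < size-1; n++)
+C	  dst[size+n] = mpn_addmul_1 (dst+2*n+1, src+n+1, size-1-n, src[n]);
+C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
+C	/* then add src[i]^2 at dst[2*i..2*i+1] for i=0..size-1 */
+C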
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ pushl %esi
+ pushl %ebx
+
+ movl %eax, %ebx
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl %edx, %esi C dst[1]
+
+ movl 4(%ebx), %eax
+
+ mull %eax C src[1]^2
+
+ movl %eax, %edi C dst[2]
+ movl %edx, %ebp C dst[3]
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0]*src[1]
+
+ addl %eax, %esi
+ popl %ebx
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %esi, %eax
+
+ adcl %edi, %edx
+ movl %eax, 4(%ecx)
+
+ adcl $0, %ebp
+ popl %esi
+
+ movl %edx, 8(%ecx)
+ movl %ebp, 12(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+
+ cmpl $4, %edx
+ pushl %ebx
+deflit(`FRAME',4)
+
+ movl PARAM_SRC, %ebx
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+
+ pushl %ebp
+ pushl %edi
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%ebx), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+
+ movl 8(%ebx), %eax
+ pushl %esi C risk of cache bank clash
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%ebx), %eax
+
+ mull 4(%ebx) C src[0] * src[1]
+
+ movl %eax, %esi
+ movl %edx, %edi
+
+ movl (%ebx), %eax
+
+ mull 8(%ebx) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%ebx), %eax
+
+ mull 8(%ebx) C src[1] * src[2]
+
+ xorl %ebx, %ebx
+ addl %eax, %ebp
+
+ C eax
+ C ebx zero, will be dst[5]
+ C ecx dst
+ C edx dst[4]
+ C esi dst[1]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %esi, %esi
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %ebx
+ addl %esi, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %esi
+
+ adcl %ebp, %esi
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl %esi, 12(%ecx)
+
+ adcl %edx, %edi
+ popl %esi
+
+ movl 20(%ecx), %eax
+ movl %edi, 16(%ecx)
+
+ popl %edi
+ popl %ebp
+
+ adcl %ebx, %eax C no carry out of this
+ popl %ebx
+
+ movl %eax, 20(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(8)
+L(four_or_more):
+ C eax src low limb
+ C ebx src
+ C ecx dst
+ C edx size
+ C esi
+ C edi
+ C ebp
+ C
+ C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+deflit(`FRAME',4)
+
+	pushl	%edi	FRAME_pushl()
+	pushl	%esi	FRAME_pushl()
+
+	pushl	%ebp	FRAME_pushl()
+ leal (%ecx,%edx,4), %edi C dst end of this mul1
+
+ leal (%ebx,%edx,4), %esi C src end
+ movl %ebx, %ebp C src
+
+ negl %edx C -size
+ xorl %ebx, %ebx C clear carry limb and carry flag
+
+ leal 1(%edx), %ecx C -(size-1)
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp src
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(mul1)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n+1...], for
+ C n=1..size-2.
+ C
+ C The last two products, which are the end corner of the product
+ C triangle, are handled separately to save looping overhead. These
+ C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
+ C If size is 4 then it's only these that need to be done.
+ C
+ C In the outer loop %esi is a constant, and %edi just advances by 1
+ C limb each time. The size of the operation decreases by 1 limb
+ C each time.
+
+ C eax
+ C ebx carry (needing carry flag added)
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ adcl $0, %ebx
+ movl PARAM_SIZE, %edx
+
+ movl %ebx, (%edi)
+ subl $4, %edx
+
+ negl %edx
+ jz L(corner)
+
+
+L(outer):
+ C ebx previous carry limb to store
+ C edx outer loop counter (negative)
+ C esi &src[size]
+ C edi dst, pointing at stored carry limb of previous loop
+
+ pushl %edx C new outer loop counter
+ leal -2(%edx), %ecx
+
+ movl %ebx, (%edi)
+ addl $4, %edi
+
+ addl $4, %ebp
+ xorl %ebx, %ebx C initial carry limb, clear carry flag
+
+L(inner):
+ C eax scratch
+ C ebx carry (needing carry flag added)
+ C ecx counter, negative
+ C edx scratch
+ C esi &src[size]
+ C edi dst end of this addmul
+ C ebp &src[j]
+
+ adcl $0, %ebx
+ movl (%esi,%ecx,4), %eax
+
+ mull (%ebp)
+
+ addl %ebx, %eax
+ movl (%edi,%ecx,4), %ebx
+
+ adcl $0, %edx
+ addl %eax, %ebx
+
+ movl %ebx, (%edi,%ecx,4)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(inner)
+
+
+ adcl $0, %ebx
+ popl %edx C outer loop counter
+
+ incl %edx
+ jnz L(outer)
+
+
+ movl %ebx, (%edi)
+
+L(corner):
+ C esi &src[size]
+ C edi &dst[2*size-4]
+
+ movl -8(%esi), %eax
+ movl -4(%edi), %ebx C risk of data cache bank clash here
+
+ mull -12(%esi) C src[size-2]*src[size-3]
+
+ addl %eax, %ebx
+ movl %edx, %ecx
+
+ adcl $0, %ecx
+ movl -4(%esi), %eax
+
+ mull -12(%esi) C src[size-1]*src[size-3]
+
+ addl %ecx, %eax
+ movl (%edi), %ecx
+
+ adcl $0, %edx
+ movl %ebx, -4(%edi)
+
+ addl %eax, %ecx
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ movl -4(%esi), %eax
+
+ mull -8(%esi) C src[size-1]*src[size-2]
+
+ movl %ecx, (%edi)
+ addl %eax, %ebx
+
+ adcl $0, %edx
+ movl PARAM_SIZE, %eax
+
+ negl %eax
+ movl %ebx, 4(%edi)
+
+ addl $1, %eax C -(size-1) and clear carry
+ movl %edx, 8(%edi)
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
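+C
+C In mpn terms this step is dst[2*size-1] = mpn_lshift (dst+1, dst+1,
+C 2*size-2, 1), done here two limbs per iteration with rcll.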
+
+L(lshift):
+ C eax counter, negative
+ C ebx next limb
+ C ecx
+ C edx
+ C esi
+ C edi &dst[2*size-4]
+ C ebp
+
+ movl 12(%edi,%eax,8), %ebx
+
+ rcll %ebx
+ movl 16(%edi,%eax,8), %ecx
+
+ rcll %ecx
+ movl %ebx, 12(%edi,%eax,8)
+
+ movl %ecx, 16(%edi,%eax,8)
+ incl %eax
+
+ jnz L(lshift)
+
+
+ adcl %eax, %eax C high bit out
+ movl PARAM_SRC, %esi
+
+ movl PARAM_SIZE, %ecx C risk of cache bank clash
+ movl %eax, 12(%edi) C dst most significant limb
+
+
+C -----------------------------------------------------------------------------
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't been set at all yet, and just gets the
+C low limb of src[0]^2.
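+C
+C A hedged C model of this pass (our naming, 32-bit limbs, with a 64-bit
+C intermediate for the squares):
+C
+C	unsigned long long sq, t;
+C	mp_limb_t cy;
+C	mp_size_t i;
+C	sq = (unsigned long long) src[0] * src[0];
+C	dst[0] = (mp_limb_t) sq;
+C	cy = (mp_limb_t) (sq >> 32);
+C	for (i = 1; i < size; i++)
+C	  {
+C	    sq = (unsigned long long) src[i] * src[i];
+C	    t = (unsigned long long) dst[2*i-1] + cy;
+C	    dst[2*i-1] = (mp_limb_t) t;
+C	    t = (unsigned long long) dst[2*i] + (mp_limb_t) sq + (t >> 32);
+C	    dst[2*i] = (mp_limb_t) t;
+C	    cy = (mp_limb_t) (sq >> 32) + (mp_limb_t) (t >> 32);
+C	  }
+C	dst[2*size-1] += cy;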
+
+ movl (%esi), %eax C src[0]
+ leal (%esi,%ecx,4), %esi C src end
+
+ negl %ecx
+
+ mull %eax
+
+ movl %eax, 16(%edi,%ecx,8) C dst[0]
+ movl %edx, %ebx
+
+ addl $1, %ecx C size-1 and clear carry
+
+L(diag):
+ C eax scratch (low product)
+ C ebx carry limb
+ C ecx counter, negative
+ C edx scratch (high product)
+ C esi &src[size]
+ C edi &dst[2*size-4]
+ C ebp scratch (fetched dst limbs)
+
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %ebx
+
+ mull %eax
+
+ movl 16-4(%edi,%ecx,8), %ebp
+
+ addl %ebp, %ebx
+ movl 16(%edi,%ecx,8), %ebp
+
+ adcl %eax, %ebp
+ movl %ebx, 16-4(%edi,%ecx,8)
+
+ movl %ebp, 16(%edi,%ecx,8)
+ incl %ecx
+
+ movl %edx, %ebx
+ jnz L(diag)
+
+
+ adcl $0, %edx
+ movl 16-4(%edi), %eax C dst most significant limb
+
+ addl %eax, %edx
+ popl %ebp
+
+ movl %edx, 16-4(%edi)
+ popl %esi C risk of cache bank clash
+
+ popl %edi
+ popl %ebx
+
+ ret
+
+EPILOGUE()