author     Thomas Voss <mail@thomasvoss.com>   2024-06-21 23:36:36 +0200
committer  Thomas Voss <mail@thomasvoss.com>   2024-06-21 23:42:26 +0200
commit     a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree       b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86_64/fastsse
parent     1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86_64/fastsse')
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/README              |  22
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm     | 311
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm             | 175
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm   | 254
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm           | 166
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm   | 300
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm           | 185
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm  | 182
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm          | 173
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm | 193
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm         | 183
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm  | 201
-rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm   | 204
13 files changed, 2549 insertions(+), 0 deletions(-)
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/README b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/README
new file mode 100644
index 0000000..5538b2d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/README
@@ -0,0 +1,22 @@
+This directory contains code for x86-64 processors with fast
+implementations of SSE operations, hence the name "fastsse".
+
+Current processors that might benefit from this code are:
+
+ AMD K10
+ AMD Bulldozer/Piledriver/Steamroller/Excavator
+ Intel Nocona
+ Intel Nehalem/Westmere
+ Intel Sandybridge/Ivybridge
+ Intel Haswell/Broadwell
+ VIA Nano
+
+Current processors that do not benefit from this code are:
+
+ AMD K8
+ AMD Bobcat
+ Intel Atom
+
+Intel Conroe/Penryn is a border case; its handling of non-aligned
+128-bit memory operands is poor. VIA Nano also has poor handling of
+non-aligned operands.
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm
new file mode 100644
index 0000000..69027bc
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm
@@ -0,0 +1,311 @@
+dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 illop 1.0/1.0 N
+C AMD K10 0.85 illop Y/N
+C AMD bd1 1.39 ? 1.45 Y/N
+C AMD bd2 0.8-1.4 0.7-1.4 Y
+C AMD bd3
+C AMD bd4
+C AMD bobcat 1.97 ? 8.17 1.5/1.5 N
+C AMD jaguar 1.02 1.02 0.91/0.91 N
+C Intel P4 2.26 illop Y/N
+C Intel core 0.58 0.87 opt/0.74 Y
+C Intel NHM 0.64 1.14 opt/bad Y
+C Intel SBR 0.51 0.65 opt/opt Y
+C Intel IBR 0.50 0.64 opt/0.57 Y
+C Intel HWL 0.51 0.58 opt/opt Y
+C Intel BWL 0.52 0.64 opt/opt Y
+C Intel SKL 0.51 0.63 opt/opt Y
+C Intel atom 1.16 1.70 opt/opt Y
+C Intel SLM 1.02 1.52 N
+C VIA nano 1.09 1.10 opt/opt Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
+C instruction is better adapted to mpn_copyd's needs; we need to contort the
+C code to use it here.
+C
+C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken
+C from the x86_64 default code.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity. We use
+C movaps, since it has the shortest coding.
+define(`movdqa', ``movaps'')
+
+ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_com)
+ FUNC_ENTRY(3)
+
+ cmp $COM_SSE_THRESHOLD, n
+ jbe L(bc)
+
+ pcmpeqb %xmm5, %xmm5 C set to 111...111
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(rp_aligned) C jump if rp aligned
+
+ mov (up), %r8
+ lea 8(up), up
+ not %r8
+ mov %r8, (rp)
+ lea 8(rp), rp
+ dec n
+
+L(rp_aligned):
+ test $8, R8(up)
+ jnz L(uent)
+
+ifelse(eval(COM_SSE_THRESHOLD >= 8),1,
+` sub $8, n',
+` jmp L(am)')
+
+ ALIGN(16)
+L(atop):movdqa 0(up), %xmm0
+ movdqa 16(up), %xmm1
+ movdqa 32(up), %xmm2
+ movdqa 48(up), %xmm3
+ lea 64(up), up
+ pxor %xmm5, %xmm0
+ pxor %xmm5, %xmm1
+ pxor %xmm5, %xmm2
+ pxor %xmm5, %xmm3
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+L(am): sub $8, n
+ jnc L(atop)
+
+ test $4, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ movdqa 16(up), %xmm1
+ lea 32(up), up
+ pxor %xmm5, %xmm0
+ pxor %xmm5, %xmm1
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ lea 16(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ not %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+
+C FIXME: The code below only handles overlap if it is close to complete, or
+C quite separate: up-rp < 5 or up-rp > 15 limbs
+ lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES
+ sub rp, %rax
+ cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES
+ jbe L(bc) C deflect to plain loop
+
+ sub $16, n
+ jc L(uend)
+
+ movdqa 120(up), %xmm3
+
+ sub $16, n
+ jmp L(um)
+
+ ALIGN(16)
+L(utop):movdqa 120(up), %xmm3
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, -128(rp)
+ sub $16, n
+L(um): movdqa 104(up), %xmm2
+ palignr($8, %xmm2, %xmm3)
+ movdqa 88(up), %xmm1
+ pxor %xmm5, %xmm3
+ movdqa %xmm3, 112(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa 72(up), %xmm0
+ pxor %xmm5, %xmm2
+ movdqa %xmm2, 96(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa 56(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 80(rp)
+ palignr($8, %xmm3, %xmm0)
+ movdqa 40(up), %xmm2
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, 64(rp)
+ palignr($8, %xmm2, %xmm3)
+ movdqa 24(up), %xmm1
+ pxor %xmm5, %xmm3
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa 8(up), %xmm0
+ pxor %xmm5, %xmm2
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa -8(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 128(up), up
+ lea 128(rp), rp
+ jnc L(utop)
+
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, -128(rp)
+
+L(uend):test $8, R8(n)
+ jz 1f
+ movdqa 56(up), %xmm3
+ movdqa 40(up), %xmm2
+ palignr($8, %xmm2, %xmm3)
+ movdqa 24(up), %xmm1
+ pxor %xmm5, %xmm3
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa 8(up), %xmm0
+ pxor %xmm5, %xmm2
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa -8(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 64(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 64(rp), rp
+
+1: test $4, R8(n)
+ jz 1f
+ movdqa 24(up), %xmm1
+ movdqa 8(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa -8(up), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 32(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa 8(up), %xmm0
+ movdqa -8(up), %xmm3
+ palignr($8, %xmm3, %xmm0)
+ lea 16(up), up
+ pxor %xmm5, %xmm0
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ not %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for
+C correctness as the above code is currently written.
+
+L(bc): lea -8(rp), rp
+ sub $4, R32(n)
+ jc L(end)
+
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+` ALIGN(16)')
+L(top): mov (up), %r8
+ mov 8(up), %r9
+ lea 32(rp), rp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ not %r8
+ not %r9
+ not %r10
+ not %r11
+ mov %r8, -24(rp)
+ mov %r9, -16(rp)
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+` sub $4, R32(n)')
+ mov %r10, -8(rp)
+ mov %r11, (rp)
+ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1,
+` jnc L(top)')
+
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ not %r8
+ mov %r8, 8(rp)
+ lea 8(rp), rp
+ lea 8(up), up
+1: test $2, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov 8(up), %r9
+ not %r8
+ not %r9
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+1: FUNC_EXIT()
+ ret
+EPILOGUE()
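
For illustration only, not part of the patch: the unaligned path of mpn_com above splices a misaligned 16-byte window out of two aligned loads with SSSE3 palignr before complementing it. A minimal C sketch of one such step, assuming uint64_t limbs, a 16-byte-aligned rp, up congruent to 8 (mod 16), and a caller that keeps the one-limb over-read on each side inside the operand, as the real loop does (the function name is illustrative):

    #include <stdint.h>
    #include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 */

    /* Complement two limbs when the source is 8 bytes out of phase with the
       16-byte-aligned destination: both loads are aligned, and palignr
       extracts the 16-byte window that straddles them. */
    static void com_step_offby8(uint64_t *rp, const uint64_t *up)
    {
        const __m128i ones = _mm_set1_epi32(-1);                 /* all-ones, like pcmpeqb */
        __m128i lo = _mm_load_si128((const __m128i *)(up - 1));  /* {up[-1], up[0]} */
        __m128i hi = _mm_load_si128((const __m128i *)(up + 1));  /* {up[1], up[2]} */
        __m128i v  = _mm_alignr_epi8(hi, lo, 8);                 /* {up[0], up[1]} */
        _mm_store_si128((__m128i *)rp, _mm_xor_si128(v, ones));  /* {~up[0], ~up[1]} */
    }
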
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm
new file mode 100644
index 0000000..c867222
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm
@@ -0,0 +1,175 @@
+dnl AMD64 mpn_com optimised for CPUs with fast SSE.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 2.0 N
+C AMD K10 0.85 1.3 Y/N
+C AMD bull 1.40 1.40 Y
+C AMD pile 0.9-1.4 0.9-1.4 Y
+C AMD steam
+C AMD excavator
+C AMD bobcat 3.1 3.1 N
+C AMD jaguar 0.91 0.91 opt/opt Y
+C Intel P4 2.28 illop Y
+C Intel core2 1.02 1.02 N
+C Intel NHM 0.53 0.68 Y
+C Intel SBR 0.51 0.75 opt/0.65 Y/N
+C Intel IBR 0.50 0.57 opt/opt Y
+C Intel HWL 0.51 0.64 opt/0.58 Y
+C Intel BWL 0.61 0.65 0.57/opt Y
+C Intel atom 3.68 3.68 N
+C Intel SLM 1.09 1.35 N
+C VIA nano 1.17 5.09 Y/N
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU. This seems to work great except for core2; there, performance
+C doubles when reading using MOVDQA (for aligned source). It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_com)
+ FUNC_ENTRY(3)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+
+ pcmpeqb %xmm7, %xmm7 C set to 111...111
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(ali) C jump if rp aligned
+ mov (up), %rax
+ lea 8(up), up
+ not %rax
+ mov %rax, (rp)
+ lea 8(rp), rp
+ dec n
+
+ sub $14, n
+ jc L(sma)
+
+ ALIGN(16)
+L(top): movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ movdqu 64(up), %xmm4
+ movdqu 80(up), %xmm5
+ movdqu 96(up), %xmm6
+ lea 112(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ pxor %xmm7, %xmm2
+ pxor %xmm7, %xmm3
+ pxor %xmm7, %xmm4
+ pxor %xmm7, %xmm5
+ pxor %xmm7, %xmm6
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ movdqa %xmm4, 64(rp)
+ movdqa %xmm5, 80(rp)
+ movdqa %xmm6, 96(rp)
+ lea 112(rp), rp
+L(ali): sub $14, n
+ jnc L(top)
+
+L(sma): add $14, n
+ test $8, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ lea 64(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ pxor %xmm7, %xmm2
+ pxor %xmm7, %xmm3
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ lea 32(up), up
+ pxor %xmm7, %xmm0
+ pxor %xmm7, %xmm1
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ lea 16(up), up
+ pxor %xmm7, %xmm0
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov (up), %rax
+ not %rax
+ mov %rax, (rp)
+1:
+L(don):
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` add $56, %rsp ')
+ FUNC_EXIT()
+ ret
+EPILOGUE()
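
For reference, the operation mpn_com performs is simply a limb-wise one's complement; the SSE code above is an optimised equivalent of this portable sketch (uint64_t standing in for mp_limb_t; the name ref_com is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Limb-wise complement: what mpn_com computes. */
    static void ref_com(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = ~up[i];
    }
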
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm
new file mode 100644
index 0000000..fac6f8a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm
@@ -0,0 +1,254 @@
+dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl Copyright 2012, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 illop 1.0/1.0 N
+C AMD K10 0.85 illop Y/N
+C AMD bull 0.70 0.70 Y
+C AMD pile 0.68 0.68 Y
+C AMD steam
+C AMD excavator
+C AMD bobcat 1.97 8.24 1.5/1.5 N
+C AMD jaguar 0.77 0.89 0.65/opt N/Y
+C Intel P4 2.26 illop Y/N
+C Intel core 0.52 0.80 opt/opt Y
+C Intel NHM 0.52 0.64 opt/opt Y
+C Intel SBR 0.51 0.51 opt/opt Y
+C Intel IBR 0.50 0.50 opt/opt Y
+C Intel HWL 0.50 0.51 opt/opt Y
+C Intel BWL 0.55 0.55 opt/opt Y
+C Intel atom 1.16 1.66 opt/opt Y
+C Intel SLM 1.02 1.04 opt/opt Y
+C VIA nano 1.08 1.06 opt/opt Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
+C
+C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
+C taken from the x86_64 default code.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity. We use
+C movaps, since it has the shortest coding.
+define(`movdqa', ``movaps'')
+
+ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+
+ lea -8(up,n,8), up
+ lea -8(rp,n,8), rp
+
+ cmp $COPYD_SSE_THRESHOLD, n
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jnz L(rp_aligned) C jump if rp aligned
+
+ mov (up), %rax C copy one limb
+ mov %rax, (rp)
+ lea -8(up), up
+ lea -8(rp), rp
+ dec n
+
+L(rp_aligned):
+ test $8, R8(up)
+ jz L(uent)
+
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+` sub $8, n',
+` jmp L(am)')
+
+ ALIGN(16)
+L(atop):movdqa -8(up), %xmm0
+ movdqa -24(up), %xmm1
+ movdqa -40(up), %xmm2
+ movdqa -56(up), %xmm3
+ lea -64(up), up
+ movdqa %xmm0, -8(rp)
+ movdqa %xmm1, -24(rp)
+ movdqa %xmm2, -40(rp)
+ movdqa %xmm3, -56(rp)
+ lea -64(rp), rp
+L(am): sub $8, n
+ jnc L(atop)
+
+ test $4, R8(n)
+ jz 1f
+ movdqa -8(up), %xmm0
+ movdqa -24(up), %xmm1
+ lea -32(up), up
+ movdqa %xmm0, -8(rp)
+ movdqa %xmm1, -24(rp)
+ lea -32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa -8(up), %xmm0
+ lea -16(up), up
+ movdqa %xmm0, -8(rp)
+ lea -16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+L(uent):sub $16, n
+ movdqa (up), %xmm0
+ jc L(uend)
+
+ ALIGN(16)
+L(utop):sub $16, n
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ movdqa -32(up), %xmm2
+ palignr($8, %xmm2, %xmm1)
+ movdqa %xmm1, -24(rp)
+ movdqa -48(up), %xmm3
+ palignr($8, %xmm3, %xmm2)
+ movdqa %xmm2, -40(rp)
+ movdqa -64(up), %xmm0
+ palignr($8, %xmm0, %xmm3)
+ movdqa %xmm3, -56(rp)
+ movdqa -80(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -72(rp)
+ movdqa -96(up), %xmm2
+ palignr($8, %xmm2, %xmm1)
+ movdqa %xmm1, -88(rp)
+ movdqa -112(up), %xmm3
+ palignr($8, %xmm3, %xmm2)
+ movdqa %xmm2, -104(rp)
+ movdqa -128(up), %xmm0
+ palignr($8, %xmm0, %xmm3)
+ movdqa %xmm3, -120(rp)
+ lea -128(up), up
+ lea -128(rp), rp
+ jnc L(utop)
+
+L(uend):test $8, R8(n)
+ jz 1f
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ movdqa -32(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, -24(rp)
+ movdqa -48(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -40(rp)
+ movdqa -64(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, -56(rp)
+ lea -64(up), up
+ lea -64(rp), rp
+
+1: test $4, R8(n)
+ jz 1f
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ movdqa -32(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, -24(rp)
+ lea -32(up), up
+ lea -32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa -16(up), %xmm1
+ palignr($8, %xmm1, %xmm0)
+ movdqa %xmm0, -8(rp)
+ lea -16(up), up
+ lea -16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for
+C correctness as the above code is currently written.
+
+L(bc): sub $4, R32(n)
+ jc L(end)
+
+ ALIGN(16)
+L(top): mov (up), %r8
+ mov -8(up), %r9
+ lea -32(rp), rp
+ mov -16(up), %r10
+ mov -24(up), %r11
+ lea -32(up), up
+ mov %r8, 32(rp)
+ mov %r9, 24(rp)
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+` sub $4, R32(n)')
+ mov %r10, 16(rp)
+ mov %r11, 8(rp)
+ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
+` jnc L(top)')
+
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+ lea -8(rp), rp
+ lea -8(up), up
+1: test $2, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov -8(up), %r9
+ mov %r8, (rp)
+ mov %r9, -8(rp)
+1: FUNC_EXIT()
+ ret
+EPILOGUE()
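
A hedged scalar sketch of the mpn_copyd contract implemented above (uint64_t for mp_limb_t; the name is illustrative): the copy walks from the highest limb downward, the direction that lets the destination overlap the source from above without clobbering limbs before they are read.

    #include <stddef.h>
    #include <stdint.h>

    /* Decreasing-order copy: safe when rp >= up even if the regions overlap. */
    static void ref_copyd(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = n; i-- > 0; )
            rp[i] = up[i];
    }
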
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm
new file mode 100644
index 0000000..b3c4706
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm
@@ -0,0 +1,166 @@
+dnl AMD64 mpn_copyd optimised for CPUs with fast SSE.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9
+C AMD K10 0.85 1.64 Y/N
+C AMD bull 1.4 1.4 Y
+C AMD pile 0.68 0.98 Y/N
+C AMD steam
+C AMD excavator
+C AMD bobcat
+C AMD jaguar 0.65 1.02 opt/0.93 Y/N
+C Intel P4 2.3 2.3 Y
+C Intel core 1.0 1.0 0.52/0.80 N
+C Intel NHM 0.5 0.67 Y
+C Intel SBR 0.51 0.75 opt/0.54 Y/N
+C Intel IBR 0.50 0.57 opt/0.50 Y
+C Intel HWL 0.50 0.57 opt/0.51 Y
+C Intel BWL 0.55 0.62 opt/0.55 Y
+C Intel atom
+C Intel SLM 1.02 1.27 opt/1.04 Y/N
+C VIA nano 1.16 5.16 Y/N
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU. This seems to work great except for core2; there, performance
+C doubles when reading using MOVDQA (for aligned source). It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`movdqu', lddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+
+ test n, n
+ jz L(don)
+
+ lea -16(rp,n,8), rp
+ lea -16(up,n,8), up
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(ali) C jump if rp aligned
+ mov 8(up), %rax
+ lea -8(up), up
+ mov %rax, 8(rp)
+ lea -8(rp), rp
+ dec n
+
+L(ali): sub $16, n
+ jc L(sma)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+
+ ALIGN(16)
+L(top): movdqu (up), %xmm0
+ movdqu -16(up), %xmm1
+ movdqu -32(up), %xmm2
+ movdqu -48(up), %xmm3
+ movdqu -64(up), %xmm4
+ movdqu -80(up), %xmm5
+ movdqu -96(up), %xmm6
+ movdqu -112(up), %xmm7
+ lea -128(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, -16(rp)
+ movdqa %xmm2, -32(rp)
+ movdqa %xmm3, -48(rp)
+ movdqa %xmm4, -64(rp)
+ movdqa %xmm5, -80(rp)
+ movdqa %xmm6, -96(rp)
+ movdqa %xmm7, -112(rp)
+ lea -128(rp), rp
+ sub $16, n
+ jnc L(top)
+
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` add $56, %rsp ')
+
+L(sma): test $8, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu -16(up), %xmm1
+ movdqu -32(up), %xmm2
+ movdqu -48(up), %xmm3
+ lea -64(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, -16(rp)
+ movdqa %xmm2, -32(rp)
+ movdqa %xmm3, -48(rp)
+ lea -64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu -16(up), %xmm1
+ lea -32(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, -16(rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ lea -16(up), up
+ movdqa %xmm0, (rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 8(up), %r8
+ mov %r8, 8(rp)
+1:
+L(don): FUNC_EXIT()
+ ret
+EPILOGUE()
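
The read-unaligned/write-aligned strategy described in the comments of copyd.asm boils down to pairing movdqu loads with movdqa stores so that one loop serves both source alignments. A minimal SSE2-intrinsics sketch of a single two-limb step, assuming rp is 16-byte aligned (the real loop moves eight vectors per iteration and handles the tails separately; the name is illustrative):

    #include <stdint.h>
    #include <emmintrin.h>  /* SSE2 */

    /* One 16-byte step: unaligned load, aligned store. */
    static void copy_step(uint64_t *rp, const uint64_t *up)
    {
        __m128i v = _mm_loadu_si128((const __m128i *)up);  /* like movdqu */
        _mm_store_si128((__m128i *)rp, v);                  /* like movdqa */
    }
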
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm
new file mode 100644
index 0000000..9876a47
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm
@@ -0,0 +1,300 @@
+dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.
+
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 2.0 illop 1.0/1.0 N
+C AMD K10 0.85 illop Y/N
+C AMD bd1 0.70 0.66 Y
+C AMD bd2 0.68 0.66 Y
+C AMD bd3 ? ?
+C AMD bd4 ? ?
+C AMD bt1 1.97 8.16 1.5/1.5 N
+C AMD bt2 0.77 0.93 0.65/opt N/Y
+C AMD zn1 ? ?
+C AMD zn2 ? ?
+C Intel P4 2.26 illop Y/N
+C Intel CNR 0.52 0.64 opt/opt Y
+C Intel NHM 0.52 0.71 0.50/0.67 N
+C Intel SBR 0.51 0.54 opt/0.51 Y
+C Intel IBR 0.50 0.54 opt/opt Y
+C Intel HWL 0.50 0.51 opt/opt Y
+C Intel BWL 0.55 0.55 opt/opt Y
+C Intel atom 1.16 1.61 opt/opt Y
+C Intel SLM 1.02 1.07 opt/opt Y
+C VIA nano 1.09 1.08 opt/opt Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
+C instruction is better adapted to mpn_copyd's needs; we need to contort the
+C code to use it here.
+C
+C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
+C taken from the x86_64 default code.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity. We use
+C movaps, since it has the shortest coding.
+dnl define(`movdqa', ``movaps'')
+
+ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+
+ cmp $COPYI_SSE_THRESHOLD, n
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(rp_aligned) C jump if rp aligned
+
+ movsq C copy one limb
+ dec n
+
+L(rp_aligned):
+ test $8, R8(up)
+ jnz L(uent)
+
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+` sub $8, n',
+` jmp L(am)')
+
+ ALIGN(16)
+L(atop):movdqa 0(up), %xmm0
+ movdqa 16(up), %xmm1
+ movdqa 32(up), %xmm2
+ movdqa 48(up), %xmm3
+ lea 64(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+L(am): sub $8, n
+ jnc L(atop)
+
+ test $4, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ movdqa 16(up), %xmm1
+ lea 32(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa (up), %xmm0
+ lea 16(up), up
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+
+ cmp $16, n
+ jc L(ued0)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+IFDOS(` movdqa %xmm8, 32(%rsp) ')
+
+ movaps 120(up), %xmm7
+ movaps 104(up), %xmm6
+ movaps 88(up), %xmm5
+ movaps 72(up), %xmm4
+ movaps 56(up), %xmm3
+ movaps 40(up), %xmm2
+ lea 128(up), up
+ sub $32, n
+ jc L(ued1)
+
+ ALIGN(16)
+L(utop):movaps -104(up), %xmm1
+ sub $16, n
+ movaps -120(up), %xmm0
+ palignr($8, %xmm6, %xmm7)
+ movaps -136(up), %xmm8
+ movdqa %xmm7, 112(rp)
+ palignr($8, %xmm5, %xmm6)
+ movaps 120(up), %xmm7
+ movdqa %xmm6, 96(rp)
+ palignr($8, %xmm4, %xmm5)
+ movaps 104(up), %xmm6
+ movdqa %xmm5, 80(rp)
+ palignr($8, %xmm3, %xmm4)
+ movaps 88(up), %xmm5
+ movdqa %xmm4, 64(rp)
+ palignr($8, %xmm2, %xmm3)
+ movaps 72(up), %xmm4
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movaps 56(up), %xmm3
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movaps 40(up), %xmm2
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm8, %xmm0)
+ lea 128(up), up
+ movdqa %xmm0, (rp)
+ lea 128(rp), rp
+ jnc L(utop)
+
+L(ued1):movaps -104(up), %xmm1
+ movaps -120(up), %xmm0
+ movaps -136(up), %xmm8
+ palignr($8, %xmm6, %xmm7)
+ movdqa %xmm7, 112(rp)
+ palignr($8, %xmm5, %xmm6)
+ movdqa %xmm6, 96(rp)
+ palignr($8, %xmm4, %xmm5)
+ movdqa %xmm5, 80(rp)
+ palignr($8, %xmm3, %xmm4)
+ movdqa %xmm4, 64(rp)
+ palignr($8, %xmm2, %xmm3)
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm8, %xmm0)
+ movdqa %xmm0, (rp)
+ lea 128(rp), rp
+
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` movdqa 32(%rsp), %xmm8 ')
+IFDOS(` add $56, %rsp ')
+
+L(ued0):test $8, R8(n)
+ jz 1f
+ movaps 56(up), %xmm3
+ movaps 40(up), %xmm2
+ movaps 24(up), %xmm1
+ movaps 8(up), %xmm0
+ movaps -8(up), %xmm4
+ palignr($8, %xmm2, %xmm3)
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm4, %xmm0)
+ lea 64(up), up
+ movdqa %xmm0, (rp)
+ lea 64(rp), rp
+
+1: test $4, R8(n)
+ jz 1f
+ movaps 24(up), %xmm1
+ movaps 8(up), %xmm0
+ palignr($8, %xmm0, %xmm1)
+ movaps -8(up), %xmm3
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm3, %xmm0)
+ lea 32(up), up
+ movdqa %xmm0, (rp)
+ lea 32(rp), rp
+
+1: test $2, R8(n)
+ jz 1f
+ movdqa 8(up), %xmm0
+ movdqa -8(up), %xmm3
+ palignr($8, %xmm3, %xmm0)
+ lea 16(up), up
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+
+1: test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+
+1: FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for
+C correctness as the above code is currently written.
+
+L(bc): lea -8(rp), rp
+ sub $4, R32(n)
+ jc L(end)
+
+ ALIGN(16)
+L(top): mov (up), %r8
+ mov 8(up), %r9
+ lea 32(rp), rp
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ mov %r8, -24(rp)
+ mov %r9, -16(rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+` sub $4, R32(n)')
+ mov %r10, -8(rp)
+ mov %r11, (rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+` jnc L(top)')
+
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, 8(rp)
+ lea 8(rp), rp
+ lea 8(up), up
+1: test $2, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov 8(up), %r9
+ mov %r8, 8(rp)
+ mov %r9, 16(rp)
+1: FUNC_EXIT()
+ ret
+EPILOGUE()
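
For contrast with the mpn_copyd sketch earlier, mpn_copyi copies in increasing order, which tolerates a destination that overlaps the source from below. A hedged scalar sketch (uint64_t for mp_limb_t; the name is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    /* Increasing-order copy: safe when rp <= up even if the regions overlap. */
    static void ref_copyi(uint64_t *rp, const uint64_t *up, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            rp[i] = up[i];
    }
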
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm
new file mode 100644
index 0000000..97f7865
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm
@@ -0,0 +1,185 @@
+dnl AMD64 mpn_copyi optimised for CPUs with fast SSE.
+
+dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
+dnl Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9
+C AMD K10 0.85 1.64 Y/N
+C AMD bull 1.4 1.4 N
+C AMD pile 0.77 0.93 N
+C AMD steam ? ?
+C AMD excavator ? ?
+C AMD bobcat
+C AMD jaguar 0.65 1.02 opt/0.93 Y/N
+C Intel P4 2.3 2.3 Y
+C Intel core 1.0 1.0 0.52/0.64 N
+C Intel NHM 0.5 0.67 Y
+C Intel SBR 0.51 0.75 opt/0.54 Y/N
+C Intel IBR 0.50 0.57 opt/0.54 Y
+C Intel HWL 0.50 0.57 opt/0.51 Y
+C Intel BWL 0.55 0.62 opt/0.55 Y
+C Intel atom
+C Intel SLM 1.02 1.27 opt/1.07 Y/N
+C VIA nano 1.16 5.16 Y/N
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We can always write using
+C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
+C operations.
+
+C Instead of having separate loops for reading aligned and unaligned, we read
+C using MOVDQU. This seems to work great except for core2; there, performance
+C doubles when reading using MOVDQA (for aligned source). It is unclear how to
+C best handle the unaligned case there.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`movdqu', lddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+
+ cmp $3, n C NB: bc code below assumes this limit
+ jc L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(ali) C jump if rp aligned
+ movsq C copy single limb
+ dec n
+
+L(ali): sub $16, n
+ jc L(sma)
+
+IFDOS(` add $-56, %rsp ')
+IFDOS(` movdqa %xmm6, (%rsp) ')
+IFDOS(` movdqa %xmm7, 16(%rsp) ')
+
+ ALIGN(16)
+L(top): movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ movdqu 64(up), %xmm4
+ movdqu 80(up), %xmm5
+ movdqu 96(up), %xmm6
+ movdqu 112(up), %xmm7
+ lea 128(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ movdqa %xmm4, 64(rp)
+ movdqa %xmm5, 80(rp)
+ movdqa %xmm6, 96(rp)
+ movdqa %xmm7, 112(rp)
+ lea 128(rp), rp
+ sub $16, n
+ jnc L(top)
+
+IFDOS(` movdqa (%rsp), %xmm6 ')
+IFDOS(` movdqa 16(%rsp), %xmm7 ')
+IFDOS(` add $56, %rsp ')
+
+L(sma): test $8, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ movdqu 32(up), %xmm2
+ movdqu 48(up), %xmm3
+ lea 64(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa %xmm3, 48(rp)
+ lea 64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ movdqu 16(up), %xmm1
+ lea 32(up), up
+ movdqa %xmm0, (rp)
+ movdqa %xmm1, 16(rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ movdqu (up), %xmm0
+ lea 16(up), up
+ movdqa %xmm0, (rp)
+ lea 16(rp), rp
+ ALIGN(16)
+1:
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+1:
+ FUNC_EXIT()
+ ret
+
+C Basecase code. Needed for good small operands speed, not for correctness as
+C the above code is currently written. The commented-out lines need to be
+C reinstated if this code is to be used for n > 3, and then the post loop
+C offsets need fixing.
+
+L(bc): sub $2, n
+ jc L(end)
+ ALIGN(16)
+1: mov (up), %rax
+ mov 8(up), %rcx
+dnl lea 16(up), up
+ mov %rax, (rp)
+ mov %rcx, 8(rp)
+dnl lea 16(rp), rp
+dnl sub $2, n
+dnl jnc 1b
+
+ test $1, R8(n)
+ jz L(ret)
+ mov 16(up), %rax
+ mov %rax, 16(rp)
+L(ret): FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm
new file mode 100644
index 0000000..a05e850
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm
@@ -0,0 +1,182 @@
+dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 3 3 2.35 no, use shl/shr
+C AMD K10 1.5-1.8 1.5-1.8 1.33 yes
+C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes
+C AMD bobcat 3.17 3.17 yes, bad for n < 20
+C Intel P4 4.67 4.67 2.7 no, slow movdqu
+C Intel core2 2.15 2.15 1.25 no, use shld/shrd
+C Intel NHM 1.66 1.66 1.25 no, use shld/shrd
+C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6
+C Intel atom 11.7 11.7 4.5 no
+C VIA nano 5.7 5.95 2.0 no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible. The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on fast movdqu loads, and uses them even for aligned operands,
+C in order to avoid the need for two separate loops.
+C
+C TODO
+C * Could 2-limb wind-down code be simplified?
+C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C for other affected CPUs.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ cmp $3, n
+ jle L(bc)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ jz L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea 1(n), %r8d
+
+ and $6, R32(%r8)
+ jz L(ba0)
+ cmp $4, R32(%r8)
+ jz L(ba4)
+ jc L(ba2)
+L(ba6): add $-4, n
+ jmp L(i56)
+L(ba0): add $-6, n
+ jmp L(i70)
+L(ba4): add $-2, n
+ jmp L(i34)
+L(ba2): add $-8, n
+ jle L(end)
+
+ ALIGN(16)
+L(top): movdqu 40(ap,n,8), %xmm1
+ movdqu 48(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, 48(rp,n,8)
+L(i70):
+ movdqu 24(ap,n,8), %xmm1
+ movdqu 32(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, 32(rp,n,8)
+L(i56):
+ movdqu 8(ap,n,8), %xmm1
+ movdqu 16(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, 16(rp,n,8)
+L(i34):
+ movdqu -8(ap,n,8), %xmm1
+ movdqu (ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp,n,8)
+ sub $8, n
+ jg L(top)
+
+L(end): test $1, R8(n)
+ jnz L(end8)
+
+ movdqu (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+
+C Basecase
+ ALIGN(16)
+L(bc): dec R32(n)
+ jz L(end8)
+
+ movq (ap,n,8), %xmm1
+ movq -8(ap,n,8), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, (rp,n,8)
+ sub $2, R32(n)
+ jl L(end8)
+ movq 8(ap), %xmm1
+ movq (ap), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
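
A hedged scalar sketch of the mpn_lshift contract the file above implements (uint64_t for mp_limb_t, 1 <= cnt <= 63, illustrative name): each output limb combines a limb shifted left with its lower neighbour shifted right by 64-cnt, the loop runs top-down so rp may overlap up from above, and the bits shifted out of the top limb are returned, matching the value the prologue computes into %rax.

    #include <stddef.h>
    #include <stdint.h>

    /* Shift {up, n} left by cnt bits into {rp, n}; return the shifted-out bits. */
    static uint64_t ref_lshift(uint64_t *rp, const uint64_t *up,
                               size_t n, unsigned cnt)
    {
        uint64_t ret = up[n - 1] >> (64 - cnt);   /* bits pushed out at the top */
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
        rp[0] = up[0] << cnt;
        return ret;
    }
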
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
new file mode 100644
index 0000000..6a17b93
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
@@ -0,0 +1,173 @@
+dnl AMD64 mpn_lshift optimised for CPUs with fast SSE.
+
+dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb good
+C 16-byte aligned 16-byte unaligned for cpu?
+C AMD K8,K9 ? ?
+C AMD K10 1.68 (1.45) 1.75 (1.49) Y
+C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
+C AMD bobcat 4 4
+C Intel P4 3 (2.7) 3 (2.7) Y
+C Intel core2 2.05 (1.67) 2.55 (1.75)
+C Intel NHM 2.05 (1.75) 2.09 (2)
+C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y
+C Intel atom ? ?
+C VIA nano 2.25 (2) 2.5 (2) Y
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations.
+
+C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
+C not true. The aligned case reads 16+8 bytes, the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C (1) The unaligned case makes many reads.
+C (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshift)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ cmp $2, n
+ jle L(le2)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea (ap,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(aent)
+ jmp L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+ ALIGN(16)
+L(utop):movdqa -8(ap,n,8), %xmm0
+ movq (ap,n,8), %xmm1
+ punpcklqdq 8(ap,n,8), %xmm1
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(uent):sub $2, n
+ ja L(utop)
+
+ jne L(end8)
+
+ movq (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ punpcklqdq 8(ap), %xmm1
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+ ALIGN(16)
+L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
+ movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
+ punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(aent):
+ sub $2, n
+ ja L(atop)
+ jne L(end8)
+
+ movdqa (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+ ALIGN(16)
+L(le2): jne L(end8)
+
+ movq 8(ap), %xmm0
+ movq (ap), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
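
Both lshift variants above do the same per-vector work: psllq on the current limb pair, psrlq on the pair one limb lower, then por. A hedged SSE2-intrinsics sketch of that step (illustrative names; cur holds limbs i and i+1, prev holds limbs i-1 and i, however they were loaded):

    #include <emmintrin.h>  /* SSE2 */

    /* result[j] = (cur[j] << cnt) | (prev[j] >> (64 - cnt)), 1 <= cnt <= 63.
       The shift counts are kept in xmm registers, as with psllq/psrlq %xmm4/%xmm5. */
    static __m128i lshift_pair(__m128i cur, __m128i prev, unsigned cnt)
    {
        __m128i c  = _mm_cvtsi32_si128((int)cnt);         /* like movd ..., %xmm4 */
        __m128i ic = _mm_cvtsi32_si128((int)(64 - cnt));  /* like movd ..., %xmm5 */
        return _mm_or_si128(_mm_sll_epi64(cur, c), _mm_srl_epi64(prev, ic));
    }
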
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
new file mode 100644
index 0000000..8250910
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm
@@ -0,0 +1,193 @@
+dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 3 3 ? no, use shl/shr
+C AMD K10 1.8-2.0 1.8-2.0 ? yes
+C AMD bd1 1.9 1.9 ? yes
+C AMD bobcat 3.67 3.67 yes, bad for n < 20
+C Intel P4 4.75 4.75 ? no, slow movdqu
+C Intel core2 2.27 2.27 ? no, use shld/shrd
+C Intel NHM 2.15 2.15 ? no, use shld/shrd
+C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6
+C Intel atom 12.9 12.9 ? no
+C VIA nano 6.18 6.44 ? no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible. The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on fast movdqu loads, and uses them even for aligned operands,
+C in order to avoid the need for two separate loops.
+C
+C TODO
+C * Could 2-limb wind-down code be simplified?
+C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C for other affected CPUs.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ pcmpeqb %xmm3, %xmm3 C set to 111...111
+
+ cmp $3, n
+ jle L(bc)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ jz L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea 1(n), %r8d
+
+ and $6, R32(%r8)
+ jz L(ba0)
+ cmp $4, R32(%r8)
+ jz L(ba4)
+ jc L(ba2)
+L(ba6): add $-4, n
+ jmp L(i56)
+L(ba0): add $-6, n
+ jmp L(i70)
+L(ba4): add $-2, n
+ jmp L(i34)
+L(ba2): add $-8, n
+ jle L(end)
+
+ ALIGN(16)
+L(top): movdqu 40(ap,n,8), %xmm1
+ movdqu 48(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, 48(rp,n,8)
+L(i70):
+ movdqu 24(ap,n,8), %xmm1
+ movdqu 32(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, 32(rp,n,8)
+L(i56):
+ movdqu 8(ap,n,8), %xmm1
+ movdqu 16(ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, 16(rp,n,8)
+L(i34):
+ movdqu -8(ap,n,8), %xmm1
+ movdqu (ap,n,8), %xmm0
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, (rp,n,8)
+ sub $8, n
+ jg L(top)
+
+L(end): test $1, R8(n)
+ jnz L(end8)
+
+ movdqu (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+
+C Basecase
+ ALIGN(16)
+L(bc): dec R32(n)
+ jz L(end8)
+
+ movq (ap,n,8), %xmm1
+ movq -8(ap,n,8), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, (rp,n,8)
+ sub $2, R32(n)
+ jl L(end8)
+ movq 8(ap), %xmm1
+ movq (ap), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ pxor %xmm3, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
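
mpn_lshiftc differs from mpn_lshift only in complementing the stored limbs (the pxor with the all-ones register above); as in that code, the returned shifted-out bits are not complemented. A hedged scalar sketch under the same assumptions as the mpn_lshift sketch earlier:

    #include <stddef.h>
    #include <stdint.h>

    /* Shift left by cnt and store the complement; return the top bits shifted out. */
    static uint64_t ref_lshiftc(uint64_t *rp, const uint64_t *up,
                                size_t n, unsigned cnt)
    {
        uint64_t ret = up[n - 1] >> (64 - cnt);
        for (size_t i = n - 1; i > 0; i--)
            rp[i] = ~((up[i] << cnt) | (up[i - 1] >> (64 - cnt)));
        rp[0] = ~(up[0] << cnt);
        return ret;
    }
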
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm
new file mode 100644
index 0000000..a616075
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm
@@ -0,0 +1,183 @@
+dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
+
+dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb good
+C 16-byte aligned 16-byte unaligned for cpu?
+C AMD K8,K9 ? ?
+C AMD K10 1.85 (1.635) 1.9 (1.67) Y
+C AMD bd1 1.82 (1.75) 1.82 (1.75) Y
+C AMD bobcat 4.5 4.5
+C Intel P4 3.6 (3.125) 3.6 (3.125) Y
+C Intel core2 2.05 (1.67) 2.55 (1.75)
+C Intel NHM 2.05 (1.875) 2.6 (2.25)
+C Intel SBR 1.55 (1.44) 2 (1.57) Y
+C Intel atom ? ?
+C VIA nano 2.5 (2.5) 2.5 (2.5) Y
+
+C We try to do as many 16-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. We always write using
+C 16-byte operations; we read with both 8-byte and 16-byte operations.
+
+C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
+C not true. The aligned case reads 16+8 bytes, the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C (1) The unaligned case makes too many reads.
+C (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_lshiftc)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov -8(ap,n,8), %rax
+ shr R8(%rcx), %rax
+
+ pcmpeqb %xmm2, %xmm2 C set to 111...111
+
+ cmp $2, n
+ jle L(le2)
+
+ lea (rp,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq -8(ap,n,8), %xmm0
+ movq -16(ap,n,8), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movq %xmm0, -8(rp,n,8)
+ dec n
+
+L(rp_aligned):
+ lea (ap,n,8), R32(%rcx)
+ test $8, R8(%rcx)
+ je L(aent)
+ jmp L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+ ALIGN(16)
+L(utop):movq (ap,n,8), %xmm1
+ punpcklqdq 8(ap,n,8), %xmm1
+ movdqa -8(ap,n,8), %xmm0
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(uent):sub $2, n
+ ja L(utop)
+
+ jne L(end8)
+
+ movq (ap), %xmm1
+ pxor %xmm0, %xmm0
+ punpcklqdq %xmm1, %xmm0
+ punpcklqdq 8(ap), %xmm1
+ psllq %xmm4, %xmm1
+ psrlq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+ ALIGN(16)
+L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
+ movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
+ punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp,n,8)
+L(aent):sub $2, n
+ ja L(atop)
+
+ jne L(end8)
+
+ movdqa (ap), %xmm0
+ pxor %xmm1, %xmm1
+ punpcklqdq %xmm0, %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movdqa %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+C *****************************************************************************
+
+ ALIGN(16)
+L(le2): jne L(end8)
+
+ movq 8(ap), %xmm0
+ movq (ap), %xmm1
+ psllq %xmm4, %xmm0
+ psrlq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ pxor %xmm2, %xmm0
+ movq %xmm0, 8(rp)
+
+L(end8):movq (ap), %xmm0
+ psllq %xmm4, %xmm0
+ pxor %xmm2, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm
new file mode 100644
index 0000000..1e270b1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm
@@ -0,0 +1,201 @@
+dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb cycles/limb good
+C aligned unaligned best seen for cpu?
+C AMD K8,K9 3 3 2.35 no, use shl/shr
+C AMD K10 1.5-1.8 1.5-1.8 1.33 yes
+C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes
+C AMD bobcat 3.17 3.17 yes, bad for n < 20
+C Intel P4 4.67 4.67 2.7 no, slow movdqu
+C Intel core2 2.15 2.15 1.25 no, use shld/shrd
+C Intel NHM 1.66 1.66 1.25 no, use shld/shrd
+C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6
+C Intel atom 11.7 11.7 4.5 no
+C VIA nano 5.7 5.95 2.0 no, slow movdqu
+
+C We try to do as many aligned 16-byte operations as possible. The top-most
+C and bottom-most writes might need 8-byte operations.
+C
+C This variant relies on fast movdqu loads, and uses them even for aligned
+C operands, in order to avoid the need for two separate loops.
+C
+C TODO
+C * Could 2-limb wind-down code be simplified?
+C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
+C for other affected CPUs.
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`n', `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_rshift)
+ FUNC_ENTRY(4)
+ movd R32(%rcx), %xmm4
+ mov $64, R32(%rax)
+ sub R32(%rcx), R32(%rax)
+ movd R32(%rax), %xmm5
+
+ neg R32(%rcx)
+ mov (ap), %rax
+ shl R8(%rcx), %rax
+
+ cmp $3, n
+ jle L(bc)
+
+ test $8, R8(rp)
+ jz L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+ movq (ap), %xmm0
+ movq 8(ap), %xmm1
+ psrlq %xmm4, %xmm0
+ psllq %xmm5, %xmm1
+ por %xmm1, %xmm0
+ movq %xmm0, (rp)
+ lea 8(ap), ap
+ lea 8(rp), rp
+ dec n
+
+L(rp_aligned):
+ lea 1(n), %r8d
+ lea (ap,n,8), ap
+ lea (rp,n,8), rp
+ neg n
+
+	and	$6, R32(%r8)		C dispatch on n mod 8 (in limb pairs)
+ jz L(bu0)
+ cmp $4, R32(%r8)
+ jz L(bu4)
+ jc L(bu2)
+L(bu6): add $4, n
+ jmp L(i56)
+L(bu0): add $6, n
+ jmp L(i70)
+L(bu4): add $2, n
+ jmp L(i34)
+L(bu2): add $8, n
+ jge L(end)
+
+ ALIGN(16)
+L(top): movdqu -64(ap,n,8), %xmm1
+ movdqu -56(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -64(rp,n,8)
+L(i70):
+ movdqu -48(ap,n,8), %xmm1
+ movdqu -40(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -48(rp,n,8)
+L(i56):
+ movdqu -32(ap,n,8), %xmm1
+ movdqu -24(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -32(rp,n,8)
+L(i34):
+ movdqu -16(ap,n,8), %xmm1
+ movdqu -8(ap,n,8), %xmm0
+ psllq %xmm5, %xmm0
+ psrlq %xmm4, %xmm1
+ por %xmm1, %xmm0
+ movdqa %xmm0, -16(rp,n,8)
+ add $8, n
+ jl L(top)
+
+L(end): test $1, R8(n)
+ jnz L(e1)
+
+ movdqu -16(ap), %xmm1
+ movq -8(ap), %xmm0
+ psrlq %xmm4, %xmm1
+ psllq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movdqa %xmm0, -16(rp)
+ FUNC_EXIT()
+ ret
+
+L(e1): movq -8(ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, -8(rp)
+ FUNC_EXIT()
+ ret
+
+C Basecase
+ ALIGN(16)
+L(bc): dec R32(n)
+ jnz 1f
+ movq (ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, (rp)
+ FUNC_EXIT()
+ ret
+
+1: movq (ap), %xmm1
+ movq 8(ap), %xmm0
+ psrlq %xmm4, %xmm1
+ psllq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, (rp)
+ dec R32(n)
+ jnz 1f
+ movq 8(ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, 8(rp)
+ FUNC_EXIT()
+ ret
+
+1: movq 8(ap), %xmm1
+ movq 16(ap), %xmm0
+ psrlq %xmm4, %xmm1
+ psllq %xmm5, %xmm0
+ por %xmm1, %xmm0
+ movq %xmm0, 8(rp)
+ movq 16(ap), %xmm0
+ psrlq %xmm4, %xmm0
+ movq %xmm0, 16(rp)
+ FUNC_EXIT()
+ ret
+EPILOGUE()
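
To make the strategy described in the comments above concrete (one scalar limb to align rp, then unaligned loads feeding aligned 16-byte stores), here is an SSE2-intrinsics sketch in C. It shows only a 2-limb inner step, not the 8-limb unrolling and entry-point dispatch of the real loop; the names are illustrative, and it assumes n >= 2, 1 <= cnt <= 63, and rp <= ap for in-place use.

    #include <emmintrin.h>          /* SSE2 intrinsics */
    #include <stdint.h>

    typedef uint64_t limb_t;

    static limb_t
    rshift_sse2_sketch (limb_t *rp, const limb_t *ap, long n, unsigned cnt)
    {
      __m128i c  = _mm_cvtsi32_si128 ((int) cnt);         /* right count */
      __m128i cc = _mm_cvtsi32_si128 ((int) (64 - cnt));  /* left count */
      limb_t retval = ap[0] << (64 - cnt);                /* out-shifted bits */

      if (((uintptr_t) rp & 15) != 0)       /* make rp 16-byte aligned */
        {
          rp[0] = (ap[0] >> cnt) | (ap[1] << (64 - cnt));
          rp++, ap++, n--;
        }

      long i;
      for (i = 0; i + 2 < n; i += 2)        /* two limbs per iteration */
        {
          __m128i lo = _mm_loadu_si128 ((const __m128i *) (ap + i));
          __m128i hi = _mm_loadu_si128 ((const __m128i *) (ap + i + 1));
          __m128i r  = _mm_or_si128 (_mm_srl_epi64 (lo, c),
                                     _mm_sll_epi64 (hi, cc));
          _mm_store_si128 ((__m128i *) (rp + i), r);      /* aligned store */
        }
      for (; i < n - 1; i++)                /* scalar wind-down */
        rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
      rp[n - 1] = ap[n - 1] >> cnt;
      return retval;
    }

The real routine above additionally unrolls to 8 limbs per iteration, entering the loop at L(i70)/L(i56)/L(i34) according to n mod 8, and finishing any odd final limb at L(end)/L(e1).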
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm
new file mode 100644
index 0000000..e7b7feb
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm
@@ -0,0 +1,204 @@
+dnl AMD64 SSE mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb cycles/limb
+C		aligned, even n	 unaligned, even n  other cases
+C AMD K8,K9 1.65 1.65 1.8
+C AMD K10 0.78 0.78 0.85
+C AMD bd1 0.80 0.91 1.25
+C AMD bobcat 2.15 2.15 2.37
+C Intel P4 2.5 2.5 2.95
+C Intel core2 1.17 1.25 1.25
+C Intel NHM 0.87 0.90 0.90
+C Intel SBR 0.63 0.79 0.77
+C Intel atom 4.3 4.3 4.3 slower than plain code
+C VIA nano 1.4 5.1 3.14 too alignment dependent
+
+C NOTES
+C  * We only honour the least significant 32 bits of the `which' and `nents'
+C    arguments, which allows efficient code using just SSE2.  Honouring all
+C    64 bits would need either the SSE4_1 pcmpeqq or some other SSE2 sequence.
+C * We use movd for copying between xmm and plain registers, since old gas
+C rejects movq. But gas assembles movd as movq when given a 64-bit greg.
+
+define(`rp', `%rdi')
+define(`tp', `%rsi')
+define(`n', `%rdx')
+define(`nents', `%rcx')
+define(`which', `%r8')
+
+define(`i', `%r10')
+define(`j', `%r9')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+C nents n rp tab which j i temp * * * *
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+
+IFDOS(` add $-88, %rsp ')
+IFDOS(` movdqu %xmm6, (%rsp) ')
+IFDOS(` movdqu %xmm7, 16(%rsp) ')
+IFDOS(` movdqu %xmm8, 32(%rsp) ')
+IFDOS(` movdqu %xmm9, 48(%rsp) ')
+
+ movd which, %xmm8
+ pshufd $0, %xmm8, %xmm8 C 4 `which' copies
+ mov $1, R32(%rax)
+ movd %rax, %xmm9
+ pshufd $0, %xmm9, %xmm9 C 4 copies of 1
+
+ mov n, j
+ add $-8, j
+ js L(outer_end)
+
+L(outer_top):
+ mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ ALIGN(16)
+L(top):	movdqa	%xmm8, %xmm0		C copy of `which'
+	pcmpeqd	%xmm1, %xmm0		C mask: all ones iff entry index = which
+	paddd	%xmm9, %xmm1		C bump entry index
+ movdqu 0(tp), %xmm2
+ movdqu 16(tp), %xmm3
+ pand %xmm0, %xmm2
+ pand %xmm0, %xmm3
+ por %xmm2, %xmm4
+ por %xmm3, %xmm5
+ movdqu 32(tp), %xmm2
+ movdqu 48(tp), %xmm3
+ pand %xmm0, %xmm2
+ pand %xmm0, %xmm3
+ por %xmm2, %xmm6
+ por %xmm3, %xmm7
+	lea	(tp,n,8), tp		C step to next table entry (stride = n limbs)
+ add $-1, i
+ jne L(top)
+
+ movdqu %xmm4, 0(rp)
+ movdqu %xmm5, 16(rp)
+ movdqu %xmm6, 32(rp)
+ movdqu %xmm7, 48(rp)
+
+ lea 64(%r11), tp
+ lea 64(rp), rp
+ add $-8, j
+ jns L(outer_top)
+L(outer_end):
+
+ test $4, R8(n)
+ je L(b0xx)
+L(b1xx):mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ ALIGN(16)
+L(tp4): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movdqu 0(tp), %xmm2
+ movdqu 16(tp), %xmm3
+ pand %xmm0, %xmm2
+ pand %xmm0, %xmm3
+ por %xmm2, %xmm4
+ por %xmm3, %xmm5
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp4)
+ movdqu %xmm4, 0(rp)
+ movdqu %xmm5, 16(rp)
+ lea 32(%r11), tp
+ lea 32(rp), rp
+
+L(b0xx):test $2, R8(n)
+ je L(b00x)
+L(b01x):mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ ALIGN(16)
+L(tp2): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movdqu 0(tp), %xmm2
+ pand %xmm0, %xmm2
+ por %xmm2, %xmm4
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp2)
+ movdqu %xmm4, 0(rp)
+ lea 16(%r11), tp
+ lea 16(rp), rp
+
+L(b00x):test $1, R8(n)
+ je L(b000)
+L(b001):mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ ALIGN(16)
+L(tp1): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movq 0(tp), %xmm2
+ pand %xmm0, %xmm2
+ por %xmm2, %xmm4
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp1)
+ movq %xmm4, 0(rp)
+
+L(b000):
+IFDOS(` movdqu (%rsp), %xmm6 ')
+IFDOS(` movdqu 16(%rsp), %xmm7 ')
+IFDOS(` movdqu 32(%rsp), %xmm8 ')
+IFDOS(` movdqu 48(%rsp), %xmm9 ')
+IFDOS(` add $88, %rsp ')
+ FUNC_EXIT()
+ ret
+EPILOGUE()
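
The NOTES above describe the side-channel-silent idea behind mpn_sec_tabselect: every one of the nents table entries is loaded, and a compare-generated all-ones/all-zeros mask (pcmpeqd against a running counter in the code above) decides which entry survives the AND/OR accumulation, so the access pattern is independent of `which'. A scalar C sketch of that technique, with illustrative names and relying on the compiler to keep the comparison branch-free (the assembly does not have to), is:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t limb_t;

    /* Select entry `which' from a table of nents entries of n limbs each,
       reading every entry so that loads do not depend on `which'. */
    static void
    sec_tabselect_sketch (limb_t *rp, const limb_t *tab,
                          size_t n, size_t nents, size_t which)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = 0;
      for (size_t k = 0; k < nents; k++)
        {
          /* all-ones when k == which, else all-zeros (no branch intended) */
          limb_t mask = (limb_t) 0 - (limb_t) (k == which);
          for (size_t i = 0; i < n; i++)
            rp[i] |= tab[k * n + i] & mask;
        }
    }

The vectorised loops above do the same accumulation 8, 4, 2 or 1 limbs at a time (L(top), L(tp4), L(tp2), L(tp1)), with the mask built in %xmm0 from the broadcast copies of `which' and of the constant 1 kept in %xmm8 and %xmm9.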