author | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200
committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200
commit | a89a14ef5da44684a16b204e7a70460cc8c4922a
tree | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86_64/fastsse
parent | 1db63fcedab0b288820d66e100b1877b1a5a8851
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86_64/fastsse')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/README | 22
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm | 311
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm | 175
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm | 254
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm | 166
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm | 300
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm | 185
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm | 182
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm | 173
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm | 193
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm | 183
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm | 201
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm | 204
13 files changed, 2549 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/README b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/README new file mode 100644 index 0000000..5538b2d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/README @@ -0,0 +1,22 @@ +This directory contains code for x86-64 processors with fast +implementations of SSE operations, hence the name "fastsse". + +Current processors that might benefit from this code are: + + AMD K10 + AMD Bulldozer/Piledriver/Steamroller/Excavator + Intel Nocona + Intel Nehalem/Westmere + Intel Sandybridge/Ivybridge + Intel Haswell/Broadwell + VIA Nano + +Current processors that do not benefit from this code are: + + AMD K8 + AMD Bobcat + Intel Atom + +Intel Conroe/Penryn is a border case; its handling of non-aligned +128-bit memory operands is poor. VIA Nano also have poor handling of +non-aligned operands. diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm new file mode 100644 index 0000000..69027bc --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com-palignr.asm @@ -0,0 +1,311 @@ +dnl AMD64 mpn_com optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bd1 1.39 ? 1.45 Y/N +C AMD bd2 0.8-1.4 0.7-1.4 Y +C AMD bd3 +C AMD bd4 +C AMD bobcat 1.97 ? 8.17 1.5/1.5 N +C AMD jaguar 1.02 1.02 0.91/0.91 N +C Intel P4 2.26 illop Y/N +C Intel core 0.58 0.87 opt/0.74 Y +C Intel NHM 0.64 1.14 opt/bad Y +C Intel SBR 0.51 0.65 opt/opt Y +C Intel IBR 0.50 0.64 opt/0.57 Y +C Intel HWL 0.51 0.58 opt/opt Y +C Intel BWL 0.52 0.64 opt/opt Y +C Intel SKL 0.51 0.63 opt/opt Y +C Intel atom 1.16 1.70 opt/opt Y +C Intel SLM 1.02 1.52 N +C VIA nano 1.09 1.10 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That +C instruction is better adapted to mpn_copyd's needs, we need to contort the +C code to use it here. +C +C For operands of < COM_SSE_THRESHOLD limbs, we use a plain 64-bit loop, taken +C from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. 
We use +C movaps, since it has the shortest coding. +define(`movdqa', ``movaps'') + +ifdef(`COM_SSE_THRESHOLD',`',`define(`COM_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + + cmp $COM_SSE_THRESHOLD, n + jbe L(bc) + + pcmpeqb %xmm5, %xmm5 C set to 111...111 + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(rp_aligned) C jump if rp aligned + + mov (up), %r8 + lea 8(up), up + not %r8 + mov %r8, (rp) + lea 8(rp), rp + dec n + +L(rp_aligned): + test $8, R8(up) + jnz L(uent) + +ifelse(eval(COM_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa 0(up), %xmm0 + movdqa 16(up), %xmm1 + movdqa 32(up), %xmm2 + movdqa 48(up), %xmm3 + lea 64(up), up + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + pxor %xmm5, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa (up), %xmm0 + movdqa 16(up), %xmm1 + lea 32(up), up + pxor %xmm5, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa (up), %xmm0 + lea 16(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent): +C Code handling up - rp = 8 (mod 16) + +C FIXME: The code below only handles overlap if it is close to complete, or +C quite separate: up-rp < 5 or up-up > 15 limbs + lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES + sub rp, %rax + cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES + jbe L(bc) C deflect to plain loop + + sub $16, n + jc L(uend) + + movdqa 120(up), %xmm3 + + sub $16, n + jmp L(um) + + ALIGN(16) +L(utop):movdqa 120(up), %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm0, -128(rp) + sub $16, n +L(um): movdqa 104(up), %xmm2 + palignr($8, %xmm2, %xmm3) + movdqa 88(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 112(rp) + palignr($8, %xmm1, %xmm2) + movdqa 72(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 96(rp) + palignr($8, %xmm0, %xmm1) + movdqa 56(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 80(rp) + palignr($8, %xmm3, %xmm0) + movdqa 40(up), %xmm2 + pxor %xmm5, %xmm0 + movdqa %xmm0, 64(rp) + palignr($8, %xmm2, %xmm3) + movdqa 24(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa 8(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 128(up), up + lea 128(rp), rp + jnc L(utop) + + pxor %xmm5, %xmm0 + movdqa %xmm0, -128(rp) + +L(uend):test $8, R8(n) + jz 1f + movdqa 56(up), %xmm3 + movdqa 40(up), %xmm2 + palignr($8, %xmm2, %xmm3) + movdqa 24(up), %xmm1 + pxor %xmm5, %xmm3 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa 8(up), %xmm0 + pxor %xmm5, %xmm2 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 64(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 64(rp), rp + +1: test $4, R8(n) + jz 1f + movdqa 24(up), %xmm1 + movdqa 8(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa -8(up), %xmm3 + pxor %xmm5, %xmm1 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 32(up), up + pxor %xmm5, %xmm0 + movdqa %xmm0, (rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa 8(up), %xmm0 + movdqa -8(up), %xmm3 + palignr($8, %xmm3, %xmm0) + lea 16(up), up + pxor %xmm5, %xmm0 + movdqa 
%xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): lea -8(rp), rp + sub $4, R32(n) + jc L(end) + +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` ALIGN(16)') +L(top): mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + not %r8 + not %r9 + not %r10 + not %r11 + mov %r8, -24(rp) + mov %r9, -16(rp) +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, -8(rp) + mov %r11, (rp) +ifelse(eval(1 || COM_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + not %r8 + mov %r8, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov 8(up), %r9 + not %r8 + not %r9 + mov %r8, 8(rp) + mov %r9, 16(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm new file mode 100644 index 0000000..c867222 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/com.asm @@ -0,0 +1,175 @@ +dnl AMD64 mpn_com optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 2.0 N +C AMD K10 0.85 1.3 Y/N +C AMD bull 1.40 1.40 Y +C AMD pile 0.9-1.4 0.9-1.4 Y +C AMD steam +C AMD excavator +C AMD bobcat 3.1 3.1 N +C AMD jaguar 0.91 0.91 opt/opt Y +C Intel P4 2.28 illop Y +C Intel core2 1.02 1.02 N +C Intel NHM 0.53 0.68 Y +C Intel SBR 0.51 0.75 opt/0.65 Y/N +C Intel IBR 0.50 0.57 opt/opt Y +C Intel HWL 0.51 0.64 opt/0.58 Y +C Intel BWL 0.61 0.65 0.57/opt Y +C Intel atom 3.68 3.68 N +C Intel SLM 1.09 1.35 N +C VIA nano 1.17 5.09 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). 
It is unclear how to +C best handle the unaligned case there. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_com) + FUNC_ENTRY(3) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + pcmpeqb %xmm7, %xmm7 C set to 111...111 + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(ali) C jump if rp aligned + mov (up), %rax + lea 8(up), up + not %rax + mov %rax, (rp) + lea 8(rp), rp + dec n + + sub $14, n + jc L(sma) + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + movdqu 64(up), %xmm4 + movdqu 80(up), %xmm5 + movdqu 96(up), %xmm6 + lea 112(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + pxor %xmm7, %xmm4 + pxor %xmm7, %xmm5 + pxor %xmm7, %xmm6 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + movdqa %xmm4, 64(rp) + movdqa %xmm5, 80(rp) + movdqa %xmm6, 96(rp) + lea 112(rp), rp +L(ali): sub $14, n + jnc L(top) + +L(sma): add $14, n + test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + lea 64(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + pxor %xmm7, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + lea 32(up), up + pxor %xmm7, %xmm0 + pxor %xmm7, %xmm1 + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea 16(up), up + pxor %xmm7, %xmm0 + movdqa %xmm0, (rp) + lea 16(rp), rp +1: + test $1, R8(n) + jz 1f + mov (up), %rax + not %rax + mov %rax, (rp) +1: +L(don): +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm new file mode 100644 index 0000000..fac6f8a --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd-palignr.asm @@ -0,0 +1,254 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bull 0.70 0.70 Y +C AMD pile 0.68 0.68 Y +C AMD steam +C AMD excavator +C AMD bobcat 1.97 8.24 1.5/1.5 N +C AMD jaguar 0.77 0.89 0.65/opt N/Y +C Intel P4 2.26 illop Y/N +C Intel core 0.52 0.80 opt/opt Y +C Intel NHM 0.52 0.64 opt/opt Y +C Intel SBR 0.51 0.51 opt/opt Y +C Intel IBR 0.50 0.50 opt/opt Y +C Intel HWL 0.50 0.51 opt/opt Y +C Intel BWL 0.55 0.55 opt/opt Y +C Intel atom 1.16 1.66 opt/opt Y +C Intel SLM 1.02 1.04 opt/opt Y +C VIA nano 1.08 1.06 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). +C +C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop, +C taken from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +define(`movdqa', ``movaps'') + +ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + lea -8(up,n,8), up + lea -8(rp,n,8), rp + + cmp $COPYD_SSE_THRESHOLD, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jnz L(rp_aligned) C jump if rp aligned + + mov (up), %rax C copy one limb + mov %rax, (rp) + lea -8(up), up + lea -8(rp), rp + dec n + +L(rp_aligned): + test $8, R8(up) + jz L(uent) + +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa -8(up), %xmm0 + movdqa -24(up), %xmm1 + movdqa -40(up), %xmm2 + movdqa -56(up), %xmm3 + lea -64(up), up + movdqa %xmm0, -8(rp) + movdqa %xmm1, -24(rp) + movdqa %xmm2, -40(rp) + movdqa %xmm3, -56(rp) + lea -64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa -8(up), %xmm0 + movdqa -24(up), %xmm1 + lea -32(up), up + movdqa %xmm0, -8(rp) + movdqa %xmm1, -24(rp) + lea -32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa -8(up), %xmm0 + lea -16(up), up + movdqa %xmm0, -8(rp) + lea -16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent):sub $16, n + movdqa (up), %xmm0 + jc L(uend) + + ALIGN(16) +L(utop):sub $16, n + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm2 + palignr($8, %xmm2, %xmm1) + movdqa %xmm1, -24(rp) + movdqa -48(up), %xmm3 + palignr($8, %xmm3, %xmm2) + movdqa %xmm2, -40(rp) + movdqa -64(up), %xmm0 + palignr($8, %xmm0, %xmm3) + movdqa %xmm3, -56(rp) + movdqa -80(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -72(rp) + movdqa -96(up), %xmm2 + palignr($8, %xmm2, %xmm1) + movdqa %xmm1, -88(rp) + movdqa -112(up), %xmm3 + palignr($8, %xmm3, %xmm2) + movdqa %xmm2, -104(rp) + movdqa -128(up), %xmm0 + palignr($8, %xmm0, %xmm3) + movdqa %xmm3, -120(rp) + lea -128(up), up + lea -128(rp), rp + jnc L(utop) + +L(uend):test $8, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + movdqa -32(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -24(rp) + movdqa -48(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -40(rp) + movdqa -64(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -56(rp) + lea -64(up), up + lea -64(rp), rp + +1: test $4, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, 
-8(rp) + movdqa -32(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, -24(rp) + lea -32(up), up + lea -32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa -16(up), %xmm1 + palignr($8, %xmm1, %xmm0) + movdqa %xmm0, -8(rp) + lea -16(up), up + lea -16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): sub $4, R32(n) + jc L(end) + + ALIGN(16) +L(top): mov (up), %r8 + mov -8(up), %r9 + lea -32(rp), rp + mov -16(up), %r10 + mov -24(up), %r11 + lea -32(up), up + mov %r8, 32(rp) + mov %r9, 24(rp) +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, 16(rp) + mov %r11, 8(rp) +ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + lea -8(rp), rp + lea -8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov -8(up), %r9 + mov %r8, (rp) + mov %r9, -8(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm new file mode 100644 index 0000000..b3c4706 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyd.asm @@ -0,0 +1,166 @@ +dnl AMD64 mpn_copyd optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 +C AMD K10 0.85 1.64 Y/N +C AMD bull 1.4 1.4 Y +C AMD pile 0.68 0.98 Y/N +C AMD steam +C AMD excavator +C AMD bobcat +C AMD jaguar 0.65 1.02 opt/0.93 Y/N +C Intel P4 2.3 2.3 Y +C Intel core 1.0 1.0 0.52/0.80 N +C Intel NHM 0.5 0.67 Y +C Intel SBR 0.51 0.75 opt/0.54 Y/N +C Intel IBR 0.50 0.57 opt/0.50 Y +C Intel HWL 0.50 0.57 opt/0.51 Y +C Intel BWL 0.55 0.62 opt/0.55 Y +C Intel atom +C Intel SLM 1.02 1.27 opt/1.04 Y/N +C VIA nano 1.16 5.16 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. 
This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`movdqu', lddqu) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_copyd) + FUNC_ENTRY(3) + + test n, n + jz L(don) + + lea -16(rp,n,8), rp + lea -16(up,n,8), up + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(ali) C jump if rp aligned + mov 8(up), %rax + lea -8(up), up + mov %rax, 8(rp) + lea -8(rp), rp + dec n + +L(ali): sub $16, n + jc L(sma) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + movdqu -32(up), %xmm2 + movdqu -48(up), %xmm3 + movdqu -64(up), %xmm4 + movdqu -80(up), %xmm5 + movdqu -96(up), %xmm6 + movdqu -112(up), %xmm7 + lea -128(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + movdqa %xmm2, -32(rp) + movdqa %xmm3, -48(rp) + movdqa %xmm4, -64(rp) + movdqa %xmm5, -80(rp) + movdqa %xmm6, -96(rp) + movdqa %xmm7, -112(rp) + lea -128(rp), rp + sub $16, n + jnc L(top) + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + +L(sma): test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + movdqu -32(up), %xmm2 + movdqu -48(up), %xmm3 + lea -64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + movdqa %xmm2, -32(rp) + movdqa %xmm3, -48(rp) + lea -64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu -16(up), %xmm1 + lea -32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, -16(rp) + lea -32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea -16(up), up + movdqa %xmm0, (rp) + lea -16(rp), rp +1: + test $1, R8(n) + jz 1f + mov 8(up), %r8 + mov %r8, 8(rp) +1: +L(don): FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm new file mode 100644 index 0000000..9876a47 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm @@ -0,0 +1,300 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
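An aside, not part of the diff or of the GMP sources: the rp - up = 8 (mod 16) case that the palignr loops in this file handle can be pictured with SSSE3 intrinsics. The sketch below is illustrative only (copy_off8 and its contract are invented for the example); unlike the real code, which peels the edge limbs with 8-byte moves, it simply requires the aligned word just below src and the one just past the end to be readable.

#include <stddef.h>
#include <stdint.h>
#include <tmmintrin.h>          /* SSSE3, for _mm_alignr_epi8 */

/* Copy nblocks 16-byte blocks when dst is 16-byte aligned and src sits
   8 bytes past a 16-byte boundary.  Every load and store is aligned;
   each stored block is built from the high half of one aligned source
   word and the low half of the next, which is what palignr($8,...) does.  */
static void
copy_off8 (uint64_t *dst, const uint64_t *src, size_t nblocks)
{
  const __m128i *s = (const __m128i *) (src - 1);   /* 16-byte aligned */
  __m128i *d = (__m128i *) dst;
  __m128i lo = _mm_load_si128 (s);
  for (size_t i = 0; i < nblocks; i++)
    {
      __m128i hi = _mm_load_si128 (s + i + 1);
      _mm_store_si128 (d + i, _mm_alignr_epi8 (hi, lo, 8));
      lo = hi;
    }
}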
+ +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 2.0 illop 1.0/1.0 N +C AMD K10 0.85 illop Y/N +C AMD bd1 0.70 0.66 Y +C AMD bd2 0.68 0.66 Y +C AMD bd3 ? ? +C AMD bd4 ? ? +C AMD bt1 1.97 8.16 1.5/1.5 N +C AMD bt2 0.77 0.93 0.65/opt N/Y +C AMD zn1 ? ? +C AMD zn2 ? ? +C Intel P4 2.26 illop Y/N +C Intel CNR 0.52 0.64 opt/opt Y +C Intel NHM 0.52 0.71 0.50/0.67 N +C Intel SBR 0.51 0.54 opt/0.51 Y +C Intel IBR 0.50 0.54 opt/opt Y +C Intel HWL 0.50 0.51 opt/opt Y +C Intel BWL 0.55 0.55 opt/opt Y +C Intel atom 1.16 1.61 opt/opt Y +C Intel SLM 1.02 1.07 opt/opt Y +C VIA nano 1.09 1.08 opt/opt Y + +C We use only 16-byte operations, except for unaligned top-most and bottom-most +C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That +C instruction is better adapted to mpn_copyd's needs, we need to contort the +C code to use it here. +C +C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop, +C taken from the x86_64 default code. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +C There are three instructions for loading an aligned 128-bit quantity. We use +C movaps, since it has the shortest coding. +dnl define(`movdqa', ``movaps'') + +ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $COPYI_SSE_THRESHOLD, n + jbe L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(rp_aligned) C jump if rp aligned + + movsq C copy one limb + dec n + +L(rp_aligned): + test $8, R8(up) + jnz L(uent) + +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` sub $8, n', +` jmp L(am)') + + ALIGN(16) +L(atop):movdqa 0(up), %xmm0 + movdqa 16(up), %xmm1 + movdqa 32(up), %xmm2 + movdqa 48(up), %xmm3 + lea 64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +L(am): sub $8, n + jnc L(atop) + + test $4, R8(n) + jz 1f + movdqa (up), %xmm0 + movdqa 16(up), %xmm1 + lea 32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa (up), %xmm0 + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +L(uent): +C Code handling up - rp = 8 (mod 16) + + cmp $16, n + jc L(ued0) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') +IFDOS(` movdqa %xmm8, 32(%rsp) ') + + movaps 120(up), %xmm7 + movaps 104(up), %xmm6 + movaps 88(up), %xmm5 + movaps 72(up), %xmm4 + movaps 56(up), %xmm3 + movaps 40(up), %xmm2 + lea 128(up), up + sub $32, n + jc L(ued1) + + ALIGN(16) +L(utop):movaps -104(up), %xmm1 + sub $16, n + movaps -120(up), %xmm0 + palignr($8, %xmm6, %xmm7) + movaps -136(up), %xmm8 + movdqa %xmm7, 112(rp) + palignr($8, %xmm5, %xmm6) + movaps 120(up), %xmm7 + movdqa %xmm6, 96(rp) + palignr($8, %xmm4, %xmm5) + movaps 104(up), %xmm6 + movdqa %xmm5, 80(rp) + palignr($8, %xmm3, %xmm4) + movaps 88(up), %xmm5 + movdqa %xmm4, 64(rp) + palignr($8, %xmm2, %xmm3) + movaps 72(up), %xmm4 + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movaps 56(up), %xmm3 + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movaps 40(up), %xmm2 + movdqa %xmm1, 16(rp) + palignr($8, %xmm8, %xmm0) + lea 128(up), up + movdqa %xmm0, (rp) + lea 128(rp), rp + jnc L(utop) + +L(ued1):movaps -104(up), %xmm1 + movaps -120(up), %xmm0 + movaps -136(up), %xmm8 + palignr($8, %xmm6, %xmm7) + 
movdqa %xmm7, 112(rp) + palignr($8, %xmm5, %xmm6) + movdqa %xmm6, 96(rp) + palignr($8, %xmm4, %xmm5) + movdqa %xmm5, 80(rp) + palignr($8, %xmm3, %xmm4) + movdqa %xmm4, 64(rp) + palignr($8, %xmm2, %xmm3) + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, 16(rp) + palignr($8, %xmm8, %xmm0) + movdqa %xmm0, (rp) + lea 128(rp), rp + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` movdqa 32(%rsp), %xmm8 ') +IFDOS(` add $56, %rsp ') + +L(ued0):test $8, R8(n) + jz 1f + movaps 56(up), %xmm3 + movaps 40(up), %xmm2 + movaps 24(up), %xmm1 + movaps 8(up), %xmm0 + movaps -8(up), %xmm4 + palignr($8, %xmm2, %xmm3) + movdqa %xmm3, 48(rp) + palignr($8, %xmm1, %xmm2) + movdqa %xmm2, 32(rp) + palignr($8, %xmm0, %xmm1) + movdqa %xmm1, 16(rp) + palignr($8, %xmm4, %xmm0) + lea 64(up), up + movdqa %xmm0, (rp) + lea 64(rp), rp + +1: test $4, R8(n) + jz 1f + movaps 24(up), %xmm1 + movaps 8(up), %xmm0 + palignr($8, %xmm0, %xmm1) + movaps -8(up), %xmm3 + movdqa %xmm1, 16(rp) + palignr($8, %xmm3, %xmm0) + lea 32(up), up + movdqa %xmm0, (rp) + lea 32(rp), rp + +1: test $2, R8(n) + jz 1f + movdqa 8(up), %xmm0 + movdqa -8(up), %xmm3 + palignr($8, %xmm3, %xmm0) + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + +1: test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) + +1: FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for +C correctness as the above code is currently written. + +L(bc): lea -8(rp), rp + sub $4, R32(n) + jc L(end) + + ALIGN(16) +L(top): mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp + mov 16(up), %r10 + mov 24(up), %r11 + lea 32(up), up + mov %r8, -24(rp) + mov %r9, -16(rp) +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` sub $4, R32(n)') + mov %r10, -8(rp) + mov %r11, (rp) +ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1, +` jnc L(top)') + +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, 8(rp) + lea 8(rp), rp + lea 8(up), up +1: test $2, R8(n) + jz 1f + mov (up), %r8 + mov 8(up), %r9 + mov %r8, 8(rp) + mov %r9, 16(rp) +1: FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm new file mode 100644 index 0000000..97f7865 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/copyi.asm @@ -0,0 +1,185 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast SSE. + +dnl Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation, +dnl Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 +C AMD K10 0.85 1.64 Y/N +C AMD bull 1.4 1.4 N +C AMD pile 0.77 0.93 N +C AMD steam ? ? +C AMD excavator ? ? +C AMD bobcat +C AMD jaguar 0.65 1.02 opt/0.93 Y/N +C Intel P4 2.3 2.3 Y +C Intel core 1.0 1.0 0.52/0.64 N +C Intel NHM 0.5 0.67 Y +C Intel SBR 0.51 0.75 opt/0.54 Y/N +C Intel IBR 0.50 0.57 opt/0.54 Y +C Intel HWL 0.50 0.57 opt/0.51 Y +C Intel BWL 0.55 0.62 opt/0.55 Y +C Intel atom +C Intel SLM 1.02 1.27 opt/1.07 Y/N +C VIA nano 1.16 5.16 Y/N + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We can always write using +C aligned 16-byte operations, we read with both aligned and unaligned 16-byte +C operations. + +C Instead of having separate loops for reading aligned and unaligned, we read +C using MOVDQU. This seems to work great except for core2; there performance +C doubles when reading using MOVDQA (for aligned source). It is unclear how to +C best handle the unaligned case there. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +dnl define(`movdqu', lddqu) + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_copyi) + FUNC_ENTRY(3) + + cmp $3, n C NB: bc code below assumes this limit + jc L(bc) + + test $8, R8(rp) C is rp 16-byte aligned? + jz L(ali) C jump if rp aligned + movsq C copy single limb + dec n + +L(ali): sub $16, n + jc L(sma) + +IFDOS(` add $-56, %rsp ') +IFDOS(` movdqa %xmm6, (%rsp) ') +IFDOS(` movdqa %xmm7, 16(%rsp) ') + + ALIGN(16) +L(top): movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + movdqu 64(up), %xmm4 + movdqu 80(up), %xmm5 + movdqu 96(up), %xmm6 + movdqu 112(up), %xmm7 + lea 128(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + movdqa %xmm4, 64(rp) + movdqa %xmm5, 80(rp) + movdqa %xmm6, 96(rp) + movdqa %xmm7, 112(rp) + lea 128(rp), rp + sub $16, n + jnc L(top) + +IFDOS(` movdqa (%rsp), %xmm6 ') +IFDOS(` movdqa 16(%rsp), %xmm7 ') +IFDOS(` add $56, %rsp ') + +L(sma): test $8, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + movdqu 32(up), %xmm2 + movdqu 48(up), %xmm3 + lea 64(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + movdqa %xmm2, 32(rp) + movdqa %xmm3, 48(rp) + lea 64(rp), rp +1: + test $4, R8(n) + jz 1f + movdqu (up), %xmm0 + movdqu 16(up), %xmm1 + lea 32(up), up + movdqa %xmm0, (rp) + movdqa %xmm1, 16(rp) + lea 32(rp), rp +1: + test $2, R8(n) + jz 1f + movdqu (up), %xmm0 + lea 16(up), up + movdqa %xmm0, (rp) + lea 16(rp), rp + ALIGN(16) +1: +L(end): test $1, R8(n) + jz 1f + mov (up), %r8 + mov %r8, (rp) +1: + FUNC_EXIT() + ret + +C Basecase code. Needed for good small operands speed, not for correctness as +C the above code is currently written. The commented-out lines need to be +C reinstated if this code is to be used for n > 3, and then the post loop +C offsets need fixing. 
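For orientation, and not as part of the diff: everything in this file, SSE paths and basecase alike, implements the reference loop below; only the number of limbs moved per iteration differs. limb_t and copyi_ref are illustrative names assuming 64-bit limbs.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;        /* stand-in for mp_limb_t on x86-64 */

/* Copy n limbs from up to rp in increasing address order, the direction
   that stays correct for overlapping operands with rp at or below up
   (mpn_copyd covers the opposite direction).  */
static void
copyi_ref (limb_t *rp, const limb_t *up, size_t n)
{
  for (size_t i = 0; i < n; i++)
    rp[i] = up[i];
}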
+ +L(bc): sub $2, n + jc L(end) + ALIGN(16) +1: mov (up), %rax + mov 8(up), %rcx +dnl lea 16(up), up + mov %rax, (rp) + mov %rcx, 8(rp) +dnl lea 16(rp), rp +dnl sub $2, n +dnl jnc 1b + + test $1, R8(n) + jz L(ret) + mov 16(up), %rax + mov %rax, 16(rp) +L(ret): FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm new file mode 100644 index 0000000..a05e850 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift-movdqu2.asm @@ -0,0 +1,182 @@ +dnl AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 2.35 no, use shl/shr +C AMD K10 1.5-1.8 1.5-1.8 1.33 yes +C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes +C AMD bobcat 3.17 3.17 yes, bad for n < 20 +C Intel P4 4.67 4.67 2.7 no, slow movdqu +C Intel core2 2.15 2.15 1.25 no, use shld/shrd +C Intel NHM 1.66 1.66 1.25 no, use shld/shrd +C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 +C Intel atom 11.7 11.7 4.5 no +C VIA nano 5.7 5.95 2.0 no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
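A reading aid, not part of the GMP sources: the SSE code in this file vectorises the following reference semantics of mpn_lshift. limb_t and lshift_ref are illustrative names; cnt must be 1..63, and the return value matches the shr computation at the top of the routine.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;        /* stand-in for mp_limb_t on x86-64 */

/* Shift {ap, n} left by cnt bits (1 <= cnt <= 63), store the low n limbs
   at rp and return the bits shifted out of the top limb.  Processing runs
   from the most significant limb downwards, so rp == ap works in place.  */
static limb_t
lshift_ref (limb_t *rp, const limb_t *ap, size_t n, unsigned cnt)
{
  limb_t ret = ap[n - 1] >> (64 - cnt);
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
  rp[0] = ap[0] << cnt;
  return ret;
}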
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + cmp $3, n + jle L(bc) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea 1(n), %r8d + + and $6, R32(%r8) + jz L(ba0) + cmp $4, R32(%r8) + jz L(ba4) + jc L(ba2) +L(ba6): add $-4, n + jmp L(i56) +L(ba0): add $-6, n + jmp L(i70) +L(ba4): add $-2, n + jmp L(i34) +L(ba2): add $-8, n + jle L(end) + + ALIGN(16) +L(top): movdqu 40(ap,n,8), %xmm1 + movdqu 48(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 48(rp,n,8) +L(i70): + movdqu 24(ap,n,8), %xmm1 + movdqu 32(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 32(rp,n,8) +L(i56): + movdqu 8(ap,n,8), %xmm1 + movdqu 16(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, 16(rp,n,8) +L(i34): + movdqu -8(ap,n,8), %xmm1 + movdqu (ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) + sub $8, n + jg L(top) + +L(end): test $1, R8(n) + jnz L(end8) + + movdqu (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jz L(end8) + + movq (ap,n,8), %xmm1 + movq -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (rp,n,8) + sub $2, R32(n) + jl L(end8) + movq 8(ap), %xmm1 + movq (ap), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm new file mode 100644 index 0000000..6a17b93 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm @@ -0,0 +1,173 @@ +dnl AMD64 mpn_lshift optimised for CPUs with fast SSE. + +dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. + +dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb good +C 16-byte aligned 16-byte unaligned for cpu? +C AMD K8,K9 ? ? +C AMD K10 1.68 (1.45) 1.75 (1.49) Y +C AMD bd1 1.82 (1.75) 1.82 (1.75) Y +C AMD bobcat 4 4 +C Intel P4 3 (2.7) 3 (2.7) Y +C Intel core2 2.05 (1.67) 2.55 (1.75) +C Intel NHM 2.05 (1.75) 2.09 (2) +C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y +C Intel atom ? ? +C VIA nano 2.25 (2) 2.5 (2) Y + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. + +C There are two inner-loops, one for when rp = ap (mod 16) and one when this is +C not true. The aligned case reads 16+8 bytes, the unaligned case reads +C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. + +C This is not yet great code: +C (1) The unaligned case makes many reads. +C (2) We should do some unrolling, at least 2-way. +C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on +C Nano. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + cmp $2, n + jle L(le2) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea (ap,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(aent) + jmp L(uent) +C ***************************************************************************** + +C Handle the case when ap != rp (mod 16). + + ALIGN(16) +L(utop):movdqa -8(ap,n,8), %xmm0 + movq (ap,n,8), %xmm1 + punpcklqdq 8(ap,n,8), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) +L(uent):sub $2, n + ja L(utop) + + jne L(end8) + + movq (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + punpcklqdq 8(ap), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + +C Handle the case when ap = rp (mod 16). 
+ + ALIGN(16) +L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] + movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] + punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, (rp,n,8) +L(aent): + sub $2, n + ja L(atop) + jne L(end8) + + movdqa (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + + ALIGN(16) +L(le2): jne L(end8) + + movq 8(ap), %xmm0 + movq (ap), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm new file mode 100644 index 0000000..8250910 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc-movdqu2.asm @@ -0,0 +1,193 @@ +dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 ? no, use shl/shr +C AMD K10 1.8-2.0 1.8-2.0 ? yes +C AMD bd1 1.9 1.9 ? yes +C AMD bobcat 3.67 3.67 yes, bad for n < 20 +C Intel P4 4.75 4.75 ? no, slow movdqu +C Intel core2 2.27 2.27 ? no, use shld/shrd +C Intel NHM 2.15 2.15 ? no, use shld/shrd +C Intel SBR 1.45 1.45 ? yes, bad for n = 4-6 +C Intel atom 12.9 12.9 ? no +C VIA nano 6.18 6.44 ? no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
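Similarly, not part of the sources: mpn_lshiftc is mpn_lshift with each stored limb complemented (the pcmpeqb/pxor mask in the code below); the returned out-shifted bits are not complemented. A minimal sketch with illustrative names, assuming 64-bit limbs and 1 <= cnt <= 63:

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;        /* stand-in for mp_limb_t on x86-64 */

static limb_t
lshiftc_ref (limb_t *rp, const limb_t *ap, size_t n, unsigned cnt)
{
  limb_t ret = ap[n - 1] >> (64 - cnt);        /* not complemented */
  for (size_t i = n - 1; i > 0; i--)
    rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (64 - cnt)));
  rp[0] = ~(ap[0] << cnt);
  return ret;
}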
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + pcmpeqb %xmm3, %xmm3 C set to 111...111 + + cmp $3, n + jle L(bc) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea 1(n), %r8d + + and $6, R32(%r8) + jz L(ba0) + cmp $4, R32(%r8) + jz L(ba4) + jc L(ba2) +L(ba6): add $-4, n + jmp L(i56) +L(ba0): add $-6, n + jmp L(i70) +L(ba4): add $-2, n + jmp L(i34) +L(ba2): add $-8, n + jle L(end) + + ALIGN(16) +L(top): movdqu 40(ap,n,8), %xmm1 + movdqu 48(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 48(rp,n,8) +L(i70): + movdqu 24(ap,n,8), %xmm1 + movdqu 32(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 32(rp,n,8) +L(i56): + movdqu 8(ap,n,8), %xmm1 + movdqu 16(ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, 16(rp,n,8) +L(i34): + movdqu -8(ap,n,8), %xmm1 + movdqu (ap,n,8), %xmm0 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (rp,n,8) + sub $8, n + jg L(top) + +L(end): test $1, R8(n) + jnz L(end8) + + movdqu (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jz L(end8) + + movq (ap,n,8), %xmm1 + movq -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (rp,n,8) + sub $2, R32(n) + jl L(end8) + movq 8(ap), %xmm1 + movq (ap), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + pxor %xmm3, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm new file mode 100644 index 0000000..a616075 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/lshiftc.asm @@ -0,0 +1,183 @@ +dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE. + +dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. + +dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb good +C 16-byte aligned 16-byte unaligned for cpu? +C AMD K8,K9 ? ? +C AMD K10 1.85 (1.635) 1.9 (1.67) Y +C AMD bd1 1.82 (1.75) 1.82 (1.75) Y +C AMD bobcat 4.5 4.5 +C Intel P4 3.6 (3.125) 3.6 (3.125) Y +C Intel core2 2.05 (1.67) 2.55 (1.75) +C Intel NHM 2.05 (1.875) 2.6 (2.25) +C Intel SBR 1.55 (1.44) 2 (1.57) Y +C Intel atom ? ? +C VIA nano 2.5 (2.5) 2.5 (2.5) Y + +C We try to do as many 16-byte operations as possible. The top-most and +C bottom-most writes might need 8-byte operations. We always write using +C 16-byte operations, we read with both 8-byte and 16-byte operations. + +C There are two inner-loops, one for when rp = ap (mod 16) and one when this is +C not true. The aligned case reads 16+8 bytes, the unaligned case reads +C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented. + +C This is not yet great code: +C (1) The unaligned case makes too many reads. +C (2) We should do some unrolling, at least 2-way. +C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on +C Nano. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov -8(ap,n,8), %rax + shr R8(%rcx), %rax + + pcmpeqb %xmm2, %xmm2 C set to 111...111 + + cmp $2, n + jle L(le2) + + lea (rp,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq -8(ap,n,8), %xmm0 + movq -16(ap,n,8), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, -8(rp,n,8) + dec n + +L(rp_aligned): + lea (ap,n,8), R32(%rcx) + test $8, R8(%rcx) + je L(aent) + jmp L(uent) +C ***************************************************************************** + +C Handle the case when ap != rp (mod 16). + + ALIGN(16) +L(utop):movq (ap,n,8), %xmm1 + punpcklqdq 8(ap,n,8), %xmm1 + movdqa -8(ap,n,8), %xmm0 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp,n,8) +L(uent):sub $2, n + ja L(utop) + + jne L(end8) + + movq (ap), %xmm1 + pxor %xmm0, %xmm0 + punpcklqdq %xmm1, %xmm0 + punpcklqdq 8(ap), %xmm1 + psllq %xmm4, %xmm1 + psrlq %xmm5, %xmm0 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + +C Handle the case when ap = rp (mod 16). 
+ + ALIGN(16) +L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2] + movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3] + punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3] + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp,n,8) +L(aent):sub $2, n + ja L(atop) + + jne L(end8) + + movdqa (ap), %xmm0 + pxor %xmm1, %xmm1 + punpcklqdq %xmm0, %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movdqa %xmm0, (rp) + FUNC_EXIT() + ret +C ***************************************************************************** + + ALIGN(16) +L(le2): jne L(end8) + + movq 8(ap), %xmm0 + movq (ap), %xmm1 + psllq %xmm4, %xmm0 + psrlq %xmm5, %xmm1 + por %xmm1, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, 8(rp) + +L(end8):movq (ap), %xmm0 + psllq %xmm4, %xmm0 + pxor %xmm2, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm new file mode 100644 index 0000000..1e270b1 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/rshift-movdqu2.asm @@ -0,0 +1,201 @@ +dnl AMD64 mpn_rshift optimised for CPUs with fast SSE including fast movdqu. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C AMD K8,K9 3 3 2.35 no, use shl/shr +C AMD K10 1.5-1.8 1.5-1.8 1.33 yes +C AMD bd1 1.7-1.9 1.7-1.9 1.33 yes +C AMD bobcat 3.17 3.17 yes, bad for n < 20 +C Intel P4 4.67 4.67 2.7 no, slow movdqu +C Intel core2 2.15 2.15 1.25 no, use shld/shrd +C Intel NHM 1.66 1.66 1.25 no, use shld/shrd +C Intel SBR 1.3 1.3 1.25 yes, bad for n = 4-6 +C Intel atom 11.7 11.7 4.5 no +C VIA nano 5.7 5.95 2.0 no, slow movdqu + +C We try to do as many aligned 16-byte operations as possible. The top-most +C and bottom-most writes might need 8-byte operations. +C +C This variant rely on fast load movdqu, and uses it even for aligned operands, +C in order to avoid the need for two separate loops. +C +C TODO +C * Could 2-limb wind-down code be simplified? +C * Improve basecase code, using shld/shrd for SBR, discrete integer shifts +C for other affected CPUs. 
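Again as a reading aid with illustrative names, not part of the sources: the reference semantics of mpn_rshift that the movdqu/psllq/psrlq loop below implements. cnt must be 1..63; the bits shifted out of the low limb are returned in the high bits of the result, matching the shl computation at the top of the routine.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;        /* stand-in for mp_limb_t on x86-64 */

/* Shift {ap, n} right by cnt bits and store at rp; ascending order, so
   rp == ap works in place.  */
static limb_t
rshift_ref (limb_t *rp, const limb_t *ap, size_t n, unsigned cnt)
{
  limb_t ret = ap[0] << (64 - cnt);
  for (size_t i = 0; i < n - 1; i++)
    rp[i] = (ap[i] >> cnt) | (ap[i + 1] << (64 - cnt));
  rp[n - 1] = ap[n - 1] >> cnt;
  return ret;
}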
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`ap', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + movd R32(%rcx), %xmm4 + mov $64, R32(%rax) + sub R32(%rcx), R32(%rax) + movd R32(%rax), %xmm5 + + neg R32(%rcx) + mov (ap), %rax + shl R8(%rcx), %rax + + cmp $3, n + jle L(bc) + + test $8, R8(rp) + jz L(rp_aligned) + +C Do one initial limb in order to make rp aligned + movq (ap), %xmm0 + movq 8(ap), %xmm1 + psrlq %xmm4, %xmm0 + psllq %xmm5, %xmm1 + por %xmm1, %xmm0 + movq %xmm0, (rp) + lea 8(ap), ap + lea 8(rp), rp + dec n + +L(rp_aligned): + lea 1(n), %r8d + lea (ap,n,8), ap + lea (rp,n,8), rp + neg n + + and $6, R32(%r8) + jz L(bu0) + cmp $4, R32(%r8) + jz L(bu4) + jc L(bu2) +L(bu6): add $4, n + jmp L(i56) +L(bu0): add $6, n + jmp L(i70) +L(bu4): add $2, n + jmp L(i34) +L(bu2): add $8, n + jge L(end) + + ALIGN(16) +L(top): movdqu -64(ap,n,8), %xmm1 + movdqu -56(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -64(rp,n,8) +L(i70): + movdqu -48(ap,n,8), %xmm1 + movdqu -40(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -48(rp,n,8) +L(i56): + movdqu -32(ap,n,8), %xmm1 + movdqu -24(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -32(rp,n,8) +L(i34): + movdqu -16(ap,n,8), %xmm1 + movdqu -8(ap,n,8), %xmm0 + psllq %xmm5, %xmm0 + psrlq %xmm4, %xmm1 + por %xmm1, %xmm0 + movdqa %xmm0, -16(rp,n,8) + add $8, n + jl L(top) + +L(end): test $1, R8(n) + jnz L(e1) + + movdqu -16(ap), %xmm1 + movq -8(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movdqa %xmm0, -16(rp) + FUNC_EXIT() + ret + +L(e1): movq -8(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, -8(rp) + FUNC_EXIT() + ret + +C Basecase + ALIGN(16) +L(bc): dec R32(n) + jnz 1f + movq (ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, (rp) + FUNC_EXIT() + ret + +1: movq (ap), %xmm1 + movq 8(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, (rp) + dec R32(n) + jnz 1f + movq 8(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 8(rp) + FUNC_EXIT() + ret + +1: movq 8(ap), %xmm1 + movq 16(ap), %xmm0 + psrlq %xmm4, %xmm1 + psllq %xmm5, %xmm0 + por %xmm1, %xmm0 + movq %xmm0, 8(rp) + movq 16(ap), %xmm0 + psrlq %xmm4, %xmm0 + movq %xmm0, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm new file mode 100644 index 0000000..e7b7feb --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm @@ -0,0 +1,204 @@ +dnl AMD64 SSE mpn_sec_tabselect. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb cycles/limb cycles/limb +C ali,evn n unal,evn n other cases +C AMD K8,K9 1.65 1.65 1.8 +C AMD K10 0.78 0.78 0.85 +C AMD bd1 0.80 0.91 1.25 +C AMD bobcat 2.15 2.15 2.37 +C Intel P4 2.5 2.5 2.95 +C Intel core2 1.17 1.25 1.25 +C Intel NHM 0.87 0.90 0.90 +C Intel SBR 0.63 0.79 0.77 +C Intel atom 4.3 4.3 4.3 slower than plain code +C VIA nano 1.4 5.1 3.14 too alignment dependent + +C NOTES +C * We only honour the least significant 32 bits of the `which' and `nents' +C arguments to allow efficient code using just SSE2. We would need to +C either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence. +C * We use movd for copying between xmm and plain registers, since old gas +C rejects movq. But gas assembles movd as movq when given a 64-bit greg. + +define(`rp', `%rdi') +define(`tp', `%rsi') +define(`n', `%rdx') +define(`nents', `%rcx') +define(`which', `%r8') + +define(`i', `%r10') +define(`j', `%r9') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 +C nents n rp tab which j i temp * * * * + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sec_tabselect) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + +IFDOS(` add $-88, %rsp ') +IFDOS(` movdqu %xmm6, (%rsp) ') +IFDOS(` movdqu %xmm7, 16(%rsp) ') +IFDOS(` movdqu %xmm8, 32(%rsp) ') +IFDOS(` movdqu %xmm9, 48(%rsp) ') + + movd which, %xmm8 + pshufd $0, %xmm8, %xmm8 C 4 `which' copies + mov $1, R32(%rax) + movd %rax, %xmm9 + pshufd $0, %xmm9, %xmm9 C 4 copies of 1 + + mov n, j + add $-8, j + js L(outer_end) + +L(outer_top): + mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ALIGN(16) +L(top): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + movdqu 16(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + movdqu 32(tp), %xmm2 + movdqu 48(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm6 + por %xmm3, %xmm7 + lea (tp,n,8), tp + add $-1, i + jne L(top) + + movdqu %xmm4, 0(rp) + movdqu %xmm5, 16(rp) + movdqu %xmm6, 32(rp) + movdqu %xmm7, 48(rp) + + lea 64(%r11), tp + lea 64(rp), rp + add $-8, j + jns L(outer_top) +L(outer_end): + + test $4, R8(n) + je L(b0xx) +L(b1xx):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + ALIGN(16) +L(tp4): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + movdqu 16(tp), %xmm3 + pand %xmm0, %xmm2 + pand %xmm0, %xmm3 + por %xmm2, %xmm4 + por %xmm3, %xmm5 + lea (tp,n,8), tp + add $-1, i + jne L(tp4) + movdqu %xmm4, 0(rp) + movdqu %xmm5, 16(rp) + lea 32(%r11), tp + lea 32(rp), rp + +L(b0xx):test $2, R8(n) + je L(b00x) +L(b01x):mov nents, i + mov tp, %r11 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + ALIGN(16) +L(tp2): movdqa %xmm8, %xmm0 + pcmpeqd %xmm1, %xmm0 + paddd %xmm9, %xmm1 + movdqu 0(tp), %xmm2 + pand %xmm0, %xmm2 + por %xmm2, %xmm4 + lea (tp,n,8), tp + add $-1, i + jne L(tp2) + movdqu %xmm4, 0(rp) + lea 16(%r11), tp + 
lea 16(rp), rp
+
+L(b00x):test $1, R8(n)
+ je L(b000)
+L(b001):mov nents, i
+ mov tp, %r11
+ pxor %xmm1, %xmm1
+ pxor %xmm4, %xmm4
+ ALIGN(16)
+L(tp1): movdqa %xmm8, %xmm0
+ pcmpeqd %xmm1, %xmm0
+ paddd %xmm9, %xmm1
+ movq 0(tp), %xmm2
+ pand %xmm0, %xmm2
+ por %xmm2, %xmm4
+ lea (tp,n,8), tp
+ add $-1, i
+ jne L(tp1)
+ movq %xmm4, 0(rp)
+
+L(b000):
+IFDOS(` movdqu (%rsp), %xmm6 ')
+IFDOS(` movdqu 16(%rsp), %xmm7 ')
+IFDOS(` movdqu 32(%rsp), %xmm8 ')
+IFDOS(` movdqu 48(%rsp), %xmm9 ')
+IFDOS(` add $88, %rsp ')
+ FUNC_EXIT()
+ ret
+EPILOGUE()
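
The routine above selects entry `which' from a table of `nents' entries of `n' limbs each while reading and masking every entry, so the memory access pattern does not depend on the secret index. As a rough C model (an editorial sketch, not part of the GMP sources), it behaves like the function below; sec_tabselect_ref and the uint64_t limb type are stand-ins for mpn_sec_tabselect and mp_limb_t, and the SSE code's restriction to the low 32 bits of `which' and `nents' noted above is ignored here.

  #include <stddef.h>
  #include <stdint.h>

  /* Constant-time table select: copy entry `which' of `nents' entries,
     each `n' limbs, from tab to rp, touching every entry.  */
  static void
  sec_tabselect_ref (uint64_t *rp, const uint64_t *tab,
                     size_t n, size_t nents, size_t which)
  {
    for (size_t i = 0; i < n; i++)
      rp[i] = 0;
    for (size_t k = 0; k < nents; k++)
      {
        /* All ones when k == which, otherwise all zeros; no branch.  */
        uint64_t mask = (uint64_t) 0 - (uint64_t) (k == which);
        for (size_t i = 0; i < n; i++)
          rp[i] |= tab[k * n + i] & mask;
      }
  }

The branch-free mask plays the role of the pcmpeqd result that the SSE loops AND into each 16-byte chunk before OR-accumulating it.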