Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86_64/fastavx')
-rw-r--r--	vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm	181
-rw-r--r--	vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm	178
2 files changed, 359 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm
new file mode 100644
index 0000000..21ab210
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm
@@ -0,0 +1,181 @@
+dnl  AMD64 mpn_copyd optimised for CPUs with fast AVX.
+
+dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb     good
+C	      aligned	      unaligned	      best seen	      for cpu?
+C AMD K8,K9	n/a
+C AMD K10	n/a
+C AMD bd1	n/a
+C AMD bd2	 4.87		 4.87			      N
+C AMD bd3	 ?		 ?
+C AMD bd4	 0.53		 ?
+C AMD zn1	 0.51		 ?
+C AMD zn2	 0.25		 ?			      Y
+C AMD zn3	 0.25		 ?			      Y
+C AMD bt1	n/a
+C AMD bt2	n/a
+C Intel P4	n/a
+C Intel CNR	n/a
+C Intel PNR	n/a
+C Intel NHM	n/a
+C Intel WSM	n/a
+C Intel SBR	 0.50		 0.91			      N
+C Intel IBR	 0.50		 0.65			      N
+C Intel HWL	 0.25		 0.30			      Y
+C Intel BWL	 0.28		 0.37			      Y
+C Intel SKL	 0.27		 ?			      Y
+C Intel atom	n/a
+C Intel SLM	n/a
+C Intel GLM	n/a
+C VIA nano	n/a
+
+C We try to do as many 32-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyd)
+	FUNC_ENTRY(3)
+
+	lea	-32(rp,n,8), rp
+	lea	-32(up,n,8), up
+
+	cmp	$7, n			C basecase needed for correctness
+	jbe	L(bc)
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(a2)			C jump if rp aligned
+	mov	24(up), %rax
+	lea	-8(up), up
+	mov	%rax, 24(rp)
+	lea	-8(rp), rp
+	dec	n
+L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
+	jz	L(a3)			C jump if rp aligned
+	vmovdqu	16(up), %xmm0
+	lea	-16(up), up
+	vmovdqa	%xmm0, 16(rp)
+	lea	-16(rp), rp
+	sub	$2, n
+L(a3):	sub	$16, n
+	jc	L(sma)
+
+	ALIGN(16)
+L(top):	vmovdqu	(up), %ymm0
+	vmovdqu	-32(up), %ymm1
+	vmovdqu	-64(up), %ymm2
+	vmovdqu	-96(up), %ymm3
+	lea	-128(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, -32(rp)
+	vmovdqa	%ymm2, -64(rp)
+	vmovdqa	%ymm3, -96(rp)
+	lea	-128(rp), rp
+L(ali):	sub	$16, n
+	jnc	L(top)
+
+L(sma):	test	$8, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	vmovdqu	-32(up), %ymm1
+	lea	-64(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, -32(rp)
+	lea	-64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	lea	-32(up), up
+	vmovdqa	%ymm0, (rp)
+	lea	-32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	vmovdqu	16(up), %xmm0
+	lea	-16(up), up
+	vmovdqa	%xmm0, 16(rp)
+	lea	-16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	24(up), %r8
+	mov	%r8, 24(rp)
+1:
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(bc):	test	$4, R8(n)
+	jz	1f
+	mov	24(up), %rax
+	mov	16(up), %rcx
+	mov	8(up), %r8
+	mov	(up), %r9
+	lea	-32(up), up
+	mov	%rax, 24(rp)
+	mov	%rcx, 16(rp)
+	mov	%r8, 8(rp)
+	mov	%r9, (rp)
+	lea	-32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	mov	24(up), %rax
+	mov	16(up), %rcx
+	lea	-16(up), up
+	mov	%rax, 24(rp)
+	mov	%rcx, 16(rp)
+	lea	-16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	24(up), %rax
+	mov	%rax, 24(rp)
+1:
+	FUNC_EXIT()
+	ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm
new file mode 100644
index 0000000..03c2440
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm
@@ -0,0 +1,178 @@
+dnl  AMD64 mpn_copyi optimised for CPUs with fast AVX.
+
+dnl  Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb     cycles/limb     cycles/limb     good
+C	      aligned	      unaligned	      best seen	      for cpu?
+C AMD K8,K9	n/a
+C AMD K10	n/a
+C AMD bd1	n/a
+C AMD bd2	 4.87		 4.87			      N
+C AMD bd3	 ?		 ?
+C AMD bd4	 0.53		 ?
+C AMD zn1	 0.51		 ?
+C AMD zn2	 0.25		 ?			      Y
+C AMD zn3	 0.25		 ?			      Y
+C AMD bt1	n/a
+C AMD bt2	n/a
+C Intel P4	n/a
+C Intel CNR	n/a
+C Intel PNR	n/a
+C Intel NHM	n/a
+C Intel WSM	n/a
+C Intel SBR	 0.50		 0.91			      N
+C Intel IBR	 0.50		 0.65			      N
+C Intel HWL	 0.25		 0.30			      Y
+C Intel BWL	 0.28		 0.37			      Y
+C Intel SKL	 0.27		 ?			      Y
+C Intel atom	n/a
+C Intel SLM	n/a
+C Intel GLM	n/a
+C VIA nano	n/a
+
+C We try to do as many 32-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.  For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_copyi)
+	FUNC_ENTRY(3)
+
+	cmp	$7, n
+	jbe	L(bc)
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(a2)			C jump if rp aligned
+	mov	(up), %rax
+	lea	8(up), up
+	mov	%rax, (rp)
+	lea	8(rp), rp
+	dec	n
+L(a2):	test	$16, R8(rp)		C is rp 32-byte aligned?
+	jz	L(a3)			C jump if rp aligned
+	vmovdqu	(up), %xmm0
+	lea	16(up), up
+	vmovdqa	%xmm0, (rp)
+	lea	16(rp), rp
+	sub	$2, n
+L(a3):	sub	$16, n
+	jc	L(sma)
+
+	ALIGN(16)
+L(top):	vmovdqu	(up), %ymm0
+	vmovdqu	32(up), %ymm1
+	vmovdqu	64(up), %ymm2
+	vmovdqu	96(up), %ymm3
+	lea	128(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, 32(rp)
+	vmovdqa	%ymm2, 64(rp)
+	vmovdqa	%ymm3, 96(rp)
+	lea	128(rp), rp
+L(ali):	sub	$16, n
+	jnc	L(top)
+
+L(sma):	test	$8, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	vmovdqu	32(up), %ymm1
+	lea	64(up), up
+	vmovdqa	%ymm0, (rp)
+	vmovdqa	%ymm1, 32(rp)
+	lea	64(rp), rp
+1:
+	test	$4, R8(n)
+	jz	1f
+	vmovdqu	(up), %ymm0
+	lea	32(up), up
+	vmovdqa	%ymm0, (rp)
+	lea	32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	vmovdqu	(up), %xmm0
+	lea	16(up), up
+	vmovdqa	%xmm0, (rp)
+	lea	16(rp), rp
+1:
+L(end):	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+1:
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(bc):	test	$4, R8(n)
+	jz	1f
+	mov	(up), %rax
+	mov	8(up), %rcx
+	mov	16(up), %r8
+	mov	24(up), %r9
+	lea	32(up), up
+	mov	%rax, (rp)
+	mov	%rcx, 8(rp)
+	mov	%r8, 16(rp)
+	mov	%r9, 24(rp)
+	lea	32(rp), rp
+1:
+	test	$2, R8(n)
+	jz	1f
+	mov	(up), %rax
+	mov	8(up), %rcx
+	lea	16(up), up
+	mov	%rax, (rp)
+	mov	%rcx, 8(rp)
+	lea	16(rp), rp
+1:
+	test	$1, R8(n)
+	jz	1f
+	mov	(up), %rax
+	mov	%rax, (rp)
+1:
+	FUNC_EXIT()
+	ret
+EPILOGUE()
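For orientation, here is a minimal C sketch of the copy strategy the file comments describe: fix up the destination with 8-byte and 16-byte head stores until it is 32-byte aligned, stream the bulk as 32-byte chunks with aligned stores and possibly unaligned loads, then finish the tail limb by limb. This is an illustration only, not the GMP implementation: the names copyi_sketch and limb_t are invented for the example, it mirrors the ascending mpn_copyi case (mpn_copyd applies the same pattern from the top end downward), and it assumes an AVX-capable build (e.g. -mavx).

    /* Illustrative sketch only, not GMP code.  Ascending limb copy in the
       spirit of mpn_copyi: destination stores are 32-byte aligned, loads
       may be unaligned.  Assumes rp is at least 8-byte aligned, as limb
       arrays are. */
    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t limb_t;

    void copyi_sketch(limb_t *rp, const limb_t *up, size_t n)
    {
        /* Small operands: plain limb-by-limb copy (the asm basecase). */
        if (n <= 7) {
            for (size_t i = 0; i < n; i++)
                rp[i] = up[i];
            return;
        }

        /* Head fixups mirroring L(a2)/L(a3): one 8-byte store to reach
           16-byte alignment, then one 16-byte store to reach 32 bytes. */
        if ((uintptr_t) rp & 8) {
            *rp++ = *up++;
            n--;
        }
        if ((uintptr_t) rp & 16) {
            __m128i t = _mm_loadu_si128((const __m128i *) up);
            _mm_store_si128((__m128i *) rp, t);
            rp += 2; up += 2; n -= 2;
        }

        /* Bulk: unaligned 32-byte loads, aligned 32-byte stores. */
        while (n >= 4) {
            __m256i t = _mm256_loadu_si256((const __m256i *) up);
            _mm256_store_si256((__m256i *) rp, t);
            rp += 4; up += 4; n -= 4;
        }

        /* Remaining tail limbs. */
        while (n--)
            *rp++ = *up++;
    }

Aligning the store side is the point of the two head fixups: the bulk loop's vmovdqa stores (here _mm256_store_si256) require a 32-byte-aligned destination, while the vmovdqu loads tolerate any source alignment, which matches the file comment about writing aligned and reading either way.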