author     Thomas Voss <mail@thomasvoss.com>   2024-06-21 23:36:36 +0200
committer  Thomas Voss <mail@thomasvoss.com>   2024-06-21 23:42:26 +0200
commit     a89a14ef5da44684a16b204e7a70460cc8c4922a (patch)
tree       b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86_64/fastavx
parent     1db63fcedab0b288820d66e100b1877b1a5a8851 (diff)

Basic constant folding implementation

Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86_64/fastavx')
 -rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm  181
 -rw-r--r--  vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm  178
 2 files changed, 359 insertions, 0 deletions
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm
new file mode 100644
index 0000000..21ab210
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyd.asm
@@ -0,0 +1,181 @@
+dnl AMD64 mpn_copyd optimised for CPUs with fast AVX.
+
+dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                 cycles/limb     cycles/limb     cycles/limb      good
+C                   aligned        unaligned       best seen      for cpu?
+C AMD K8,K9          n/a
+C AMD K10            n/a
+C AMD bd1            n/a
+C AMD bd2            4.87            4.87                            N
+C AMD bd3             ?               ?
+C AMD bd4            0.53             ?
+C AMD zn1            0.51             ?
+C AMD zn2            0.25             ?                              Y
+C AMD zn3            0.25             ?                              Y
+C AMD bt1            n/a
+C AMD bt2            n/a
+C Intel P4           n/a
+C Intel CNR          n/a
+C Intel PNR          n/a
+C Intel NHM          n/a
+C Intel WSM          n/a
+C Intel SBR          0.50            0.91                            N
+C Intel IBR          0.50            0.65                            N
+C Intel HWL          0.25            0.30                            Y
+C Intel BWL          0.28            0.37                            Y
+C Intel SKL          0.27             ?                              Y
+C Intel atom         n/a
+C Intel SLM          n/a
+C Intel GLM          n/a
+C VIA nano           n/a
+
+C We try to do as many 32-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyd)
+ FUNC_ENTRY(3)
+
+ lea -32(rp,n,8), rp
+ lea -32(up,n,8), up
+
+ cmp $7, n C basecase needed for correctness
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(a2) C jump if rp aligned
+ mov 24(up), %rax
+ lea -8(up), up
+ mov %rax, 24(rp)
+ lea -8(rp), rp
+ dec n
+L(a2): test $16, R8(rp) C is rp 32-byte aligned?
+ jz L(a3) C jump if rp aligned
+ vmovdqu 16(up), %xmm0
+ lea -16(up), up
+ vmovdqa %xmm0, 16(rp)
+ lea -16(rp), rp
+ sub $2, n
+L(a3): sub $16, n
+ jc L(sma)
+
+ ALIGN(16)
+L(top): vmovdqu (up), %ymm0
+ vmovdqu -32(up), %ymm1
+ vmovdqu -64(up), %ymm2
+ vmovdqu -96(up), %ymm3
+ lea -128(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, -32(rp)
+ vmovdqa %ymm2, -64(rp)
+ vmovdqa %ymm3, -96(rp)
+ lea -128(rp), rp
+L(ali): sub $16, n
+ jnc L(top)
+
+L(sma): test $8, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ vmovdqu -32(up), %ymm1
+ lea -64(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, -32(rp)
+ lea -64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ lea -32(up), up
+ vmovdqa %ymm0, (rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ vmovdqu 16(up), %xmm0
+ lea -16(up), up
+ vmovdqa %xmm0, 16(rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 24(up), %r8
+ mov %r8, 24(rp)
+1:
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(bc): test $4, R8(n)
+ jz 1f
+ mov 24(up), %rax
+ mov 16(up), %rcx
+ mov 8(up), %r8
+ mov (up), %r9
+ lea -32(up), up
+ mov %rax, 24(rp)
+ mov %rcx, 16(rp)
+ mov %r8, 8(rp)
+ mov %r9, (rp)
+ lea -32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ mov 24(up), %rax
+ mov 16(up), %rcx
+ lea -16(up), up
+ mov %rax, 24(rp)
+ mov %rcx, 16(rp)
+ lea -16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov 24(up), %rax
+ mov %rax, 24(rp)
+1:
+ FUNC_EXIT()
+ ret
+EPILOGUE()
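
For reference, the following is a minimal C sketch of what mpn_copyd computes, matching the strategy comment in copyd.asm above: copy n limbs from up to rp starting at the highest limb and working downwards, which is why the assembly biases rp and up by -32 up front and peels the top-most words first. The helper name ref_copyd and the plain limb loop are illustrative only, not GMP's actual generic implementation.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;  /* one 64-bit limb, as on x86-64 */

    /* Sketch of mpn_copyd semantics: copy n limbs from up to rp,
       proceeding from the highest limb downwards.  Copying in this
       direction keeps the routine usable when the destination
       overlaps the source from above (rp >= up).  */
    static void
    ref_copyd (mp_limb_t *rp, const mp_limb_t *up, size_t n)
    {
      while (n-- > 0)
        rp[n] = up[n];
    }
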
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm
new file mode 100644
index 0000000..03c2440
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/fastavx/copyi.asm
@@ -0,0 +1,178 @@
+dnl AMD64 mpn_copyi optimised for CPUs with fast AVX.
+
+dnl Copyright 2003, 2005, 2007, 2011-2013, 2015 Free Software Foundation, Inc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                 cycles/limb     cycles/limb     cycles/limb      good
+C                   aligned        unaligned       best seen      for cpu?
+C AMD K8,K9          n/a
+C AMD K10            n/a
+C AMD bd1            n/a
+C AMD bd2            4.87            4.87                            N
+C AMD bd3             ?               ?
+C AMD bd4            0.53             ?
+C AMD zn1            0.51             ?
+C AMD zn2            0.25             ?                              Y
+C AMD zn3            0.25             ?                              Y
+C AMD bt1            n/a
+C AMD bt2            n/a
+C Intel P4           n/a
+C Intel CNR          n/a
+C Intel PNR          n/a
+C Intel NHM          n/a
+C Intel WSM          n/a
+C Intel SBR          0.50            0.91                            N
+C Intel IBR          0.50            0.65                            N
+C Intel HWL          0.25            0.30                            Y
+C Intel BWL          0.28            0.37                            Y
+C Intel SKL          0.27             ?                              Y
+C Intel atom         n/a
+C Intel SLM          n/a
+C Intel GLM          n/a
+C VIA nano           n/a
+
+C We try to do as many 32-byte operations as possible. The top-most and
+C bottom-most writes might need 8-byte operations. For the bulk copying, we
+C write using aligned 32-byte operations, but we read with both aligned and
+C unaligned 32-byte operations.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+dnl define(`vmovdqu', vlddqu)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_copyi)
+ FUNC_ENTRY(3)
+
+ cmp $7, n
+ jbe L(bc)
+
+ test $8, R8(rp) C is rp 16-byte aligned?
+ jz L(a2) C jump if rp aligned
+ mov (up), %rax
+ lea 8(up), up
+ mov %rax, (rp)
+ lea 8(rp), rp
+ dec n
+L(a2): test $16, R8(rp) C is rp 32-byte aligned?
+ jz L(a3) C jump if rp aligned
+ vmovdqu (up), %xmm0
+ lea 16(up), up
+ vmovdqa %xmm0, (rp)
+ lea 16(rp), rp
+ sub $2, n
+L(a3): sub $16, n
+ jc L(sma)
+
+ ALIGN(16)
+L(top): vmovdqu (up), %ymm0
+ vmovdqu 32(up), %ymm1
+ vmovdqu 64(up), %ymm2
+ vmovdqu 96(up), %ymm3
+ lea 128(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, 32(rp)
+ vmovdqa %ymm2, 64(rp)
+ vmovdqa %ymm3, 96(rp)
+ lea 128(rp), rp
+L(ali): sub $16, n
+ jnc L(top)
+
+L(sma): test $8, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ vmovdqu 32(up), %ymm1
+ lea 64(up), up
+ vmovdqa %ymm0, (rp)
+ vmovdqa %ymm1, 32(rp)
+ lea 64(rp), rp
+1:
+ test $4, R8(n)
+ jz 1f
+ vmovdqu (up), %ymm0
+ lea 32(up), up
+ vmovdqa %ymm0, (rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ vmovdqu (up), %xmm0
+ lea 16(up), up
+ vmovdqa %xmm0, (rp)
+ lea 16(rp), rp
+1:
+L(end): test $1, R8(n)
+ jz 1f
+ mov (up), %r8
+ mov %r8, (rp)
+1:
+ FUNC_EXIT()
+ ret
+
+ ALIGN(16)
+L(bc): test $4, R8(n)
+ jz 1f
+ mov (up), %rax
+ mov 8(up), %rcx
+ mov 16(up), %r8
+ mov 24(up), %r9
+ lea 32(up), up
+ mov %rax, (rp)
+ mov %rcx, 8(rp)
+ mov %r8, 16(rp)
+ mov %r9, 24(rp)
+ lea 32(rp), rp
+1:
+ test $2, R8(n)
+ jz 1f
+ mov (up), %rax
+ mov 8(up), %rcx
+ lea 16(up), up
+ mov %rax, (rp)
+ mov %rcx, 8(rp)
+ lea 16(rp), rp
+1:
+ test $1, R8(n)
+ jz 1f
+ mov (up), %rax
+ mov %rax, (rp)
+1:
+ FUNC_EXIT()
+ ret
+EPILOGUE()
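
The strategy comment in copyi.asm describes the same approach in the forward direction: peel off at most one 8-byte and one 16-byte store until rp is 32-byte aligned, run a bulk loop of unaligned 32-byte reads and aligned 32-byte writes, then finish the tail with smaller operations. Below is a rough C intrinsics sketch of that pattern, assuming AVX is available (compile with -mavx); the function name and the simplified head/tail handling are illustrative and not part of GMP.

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative forward copy in the style of copyi.asm: align the
       destination to 32 bytes with small stores, then use unaligned
       256-bit loads paired with aligned 256-bit stores for the bulk.
       rp and up are assumed limb (8-byte) aligned, as in the asm.  */
    static void
    avx_copyi_sketch (uint64_t *rp, const uint64_t *up, size_t n)
    {
      if (n != 0 && ((uintptr_t) rp & 8))       /* rp only 8-byte aligned */
        {
          *rp++ = *up++;
          n--;
        }
      if (n >= 2 && ((uintptr_t) rp & 16))      /* rp only 16-byte aligned */
        {
          _mm_store_si128 ((__m128i *) rp,
                           _mm_loadu_si128 ((const __m128i *) up));
          rp += 2;  up += 2;  n -= 2;
        }
      for (; n >= 4; rp += 4, up += 4, n -= 4)  /* bulk: 4 limbs = 32 bytes */
        _mm256_store_si256 ((__m256i *) rp,
                            _mm256_loadu_si256 ((const __m256i *) up));
      for (size_t i = 0; i < n; i++)            /* tail: remaining limbs */
        rp[i] = up[i];
    }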