From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001
From: Thomas Voss
Date: Fri, 21 Jun 2024 23:36:36 +0200
Subject: Basic constant folding implementation

---
 vendor/gmp-6.3.0/mpn/x86_64/bd1/README            |  11 +
 vendor/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm      | 235 ++++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm    |  37 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm     |  38 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm        |  37 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm     | 190 ++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/com.asm           |  37 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm         |  37 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm         |  37 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm        |  37 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h      | 265 ++++++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm       | 206 +++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm         | 193 ++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm         | 195 ++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm  | 416 ++++++++++++++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm      | 191 ++++++++++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm |  37 ++
 vendor/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm     |  37 ++
 18 files changed, 2236 insertions(+)
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/README
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/com.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm
 create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm

(limited to 'vendor/gmp-6.3.0/mpn/x86_64/bd1')

diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/README b/vendor/gmp-6.3.0/mpn/x86_64/bd1/README
new file mode 100644
index 0000000..ccd210e
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/README
@@ -0,0 +1,11 @@
+This directory contains code for AMD bulldozer including its piledriver update.
+
+We currently make limited use of SIMD instructions, both via the MPN_PATH and
+via inclusion of x86_64/fastsse files.
+
+The bd1 cores share one SIMD/FPU pipeline for two integer units. This probably
+means that an all-core GMP load (such as a HPC load) might run slower if there
+is significant SIMD dependency.
+
+We should perhaps allow a special 'bd1nosimd' pseudo cpu-name excluding any
+SIMD code.
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm
new file mode 100644
index 0000000..b54e91a
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/addmul_2.asm
@@ -0,0 +1,235 @@
+dnl AMD64 mpn_addmul_2 optimised for AMD Bulldozer.
+
+dnl Copyright 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 +C AMD K10 +C AMD bd1 4.2 +C AMD bd2 4.4 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bt1 +C AMD bt2 +C Intel P4 +C Intel PNR +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel SKL +C Intel atom +C Intel SLM +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`n', `%rcx') +define(`v0', `%rbx') +define(`v1', `%rbp') +define(`X0', `%r12') +define(`X1', `%r13') + +define(`w0', `%r8') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_addmul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + + mov (vp), v0 + mov 8(vp), v1 + + mov (up), %rax + mov $0, R32(w2) C abuse w2 + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + sub n_param, w2 + mul v0 + + test $1, R8(w2) + jnz L(bx1) + +L(bx0): mov %rdx, X0 + mov %rax, X1 + test $2, R8(w2) + jnz L(b10) + +L(b00): lea (w2), n C un = 4, 8, 12, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w3 + mul v1 + mov %rax, w0 + mov 8(up,w2,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(w2), n C un = 2, 6, 10, ... + mov (up,w2,8), %rax + mov (rp,w2,8), w1 + mul v1 + mov %rdx, w3 + mov %rax, w2 + mov -8(up,n,8), %rax + test n, n + jz L(end) + jmp L(top) + +L(bx1): mov %rax, X0 + mov %rdx, X1 + test $2, R8(w2) + jz L(b11) + +L(b01): lea 1(w2), n C un = 1, 5, 9, ... + mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(w2), n C un = 3, 7, 11, ... 
+ mov (up,w2,8), %rax + mul v1 + mov (rp,w2,8), w0 + mov %rax, w1 + mov 8(up,w2,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + add w1, X1 + mov X1, -16(rp) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up), %rax + mul v1 + mov -8(rp), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, -8(rp) + add w3, X1 + mov X1, (rp) + adc $0, %rdx + mov %rdx, %rax + + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm new file mode 100644 index 0000000..c34a5fa --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_addlsh1_n and mpn_rsblsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc) +include_mpn(`x86_64/atom/aorrlsh1_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm new file mode 100644 index 0000000..5516c9d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorrlsh_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_addlsh_n and mpn_rsblsh_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n) +include_mpn(`x86_64/aorrlsh_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm new file mode 100644 index 0000000..143c42e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aors_n.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Silvermont. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) +include_mpn(`x86_64/coreihwl/aors_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm new file mode 100644 index 0000000..fc0d2fe --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/aorsmul_1.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD Bulldozer. + +dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.30 3.58 +C AMD K10 3.09 +C AMD bull 4.47 4.72 +C AMD pile 4.66 +C AMD steam +C AMD excavator +C AMD bobcat 6.30 +C AMD jaguar 6.29 +C Intel P4 17.3 17.8 +C Intel core2 5.13 +C Intel NHM 4.85 +C Intel SBR 3.83 +C Intel IBR 3.75 +C Intel HWL 3.45 +C Intel BWL 2.56 +C Intel SKL 2.53 +C Intel atom 20.3 +C Intel SLM 9 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Try to make loop run closer to 4 c/l in Bulldozer and Piledriver. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%r11') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``r11'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %rbx ') C move away n from rdx, mul uses it +IFDOS(` mov n, %rbx ') + mul v0 + +IFSTD(` mov %rbx, n ') + + and $3, R32(%rbx) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%rbx) + jb L(b1) + jz L(b2) + +L(b3): mov $0, R32(%r8) + mov %rax, %rbx + mov $0, R32(%r9) + mov 8(up), %rax + mov %rdx, %r10 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov $0, R32(%r10) + mov %rax, %r8 + mov %rdx, %rbx + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): cmp $1, n + jz L(n1) + mov %rax, %r9 + mov 8(up), %rax + mov %rdx, %r8 + mov $0, R32(%rbx) + lea (up,n,8), up + neg n + inc n + jmp L(L1) + +L(b2): mov $0, R32(%rbx) + mov %rax, %r10 + mov %rdx, %r9 + mov 8(up), %rax + mov $0, R32(%r8) + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(32) +L(top): mul v0 + ADDSUB %r10, (rp,n,8) + adc %rax, %r9 + mov (up,n,8), %rax + adc %rdx, %r8 +L(L1): mul v0 + mov $0, R32(%r10) + ADDSUB %r9, 8(rp,n,8) + adc %rax, %r8 + adc %rdx, %rbx + mov 8(up,n,8), %rax +L(L0): mul v0 + ADDSUB %r8, 16(rp,n,8) + mov $0, R32(%r8) + adc %rax, %rbx + mov $0, R32(%r9) + mov 16(up,n,8), %rax + adc %rdx, %r10 +L(L3): mul v0 + ADDSUB %rbx, 24(rp,n,8) + mov $0, R32(%rbx) + adc %rax, %r10 + adc %rdx, %r9 + mov 24(up,n,8), %rax + add $4, n + js L(top) + +L(end): mul v0 + ADDSUB %r10, (rp) + adc %r9, %rax + adc %r8, %rdx +L(n1): ADDSUB %rax, 8(rp) + adc $0, %rdx + mov %rdx, %rax + + pop %rbx 
+IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/com.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/com.asm new file mode 100644 index 0000000..43f3561 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm new file mode 100644 index 0000000..675cdc3 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm new file mode 100644 index 0000000..ceef036 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for AMD bd1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm new file mode 100644 index 0000000..4723093 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/gcd_11.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_gcd_11. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_gcd_11) +include_mpn(`x86_64/core2/gcd_11.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h new file mode 100644 index 0000000..210f382 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/gmp-mparam.h @@ -0,0 +1,265 @@ +/* AMD bd1 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +/* 3600-3800 MHz Bulldozer Zambezi */ +/* FFT tuning limit = 464,627,200 */ +/* Generated by tuneup.c, 2019-10-20, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 31 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 2 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 27 + +#define DIV_1_VS_MUL_1_PERCENT 275 + +#define MUL_TOOM22_THRESHOLD 20 +#define MUL_TOOM33_THRESHOLD 57 +#define MUL_TOOM44_THRESHOLD 161 +#define MUL_TOOM6H_THRESHOLD 226 +#define MUL_TOOM8H_THRESHOLD 339 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 61 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 108 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 105 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 113 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 91 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 24 +#define SQR_TOOM3_THRESHOLD 85 +#define SQR_TOOM4_THRESHOLD 234 +#define SQR_TOOM6_THRESHOLD 286 +#define SQR_TOOM8_THRESHOLD 466 + +#define MULMID_TOOM42_THRESHOLD 20 + +#define MULMOD_BNM1_THRESHOLD 12 +#define SQRMOD_BNM1_THRESHOLD 15 + +#define MUL_FFT_MODF_THRESHOLD 412 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 412, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 25, 7}, { 13, 6}, \ + { 28, 7}, { 25, 8}, { 13, 7}, { 28, 8}, \ + { 15, 7}, { 32, 8}, { 17, 7}, { 35, 8}, \ + { 19, 7}, { 39, 8}, { 27, 9}, { 15, 8}, \ + { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ + { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ + { 23, 9}, { 55,11}, { 15,10}, { 31, 9}, \ + { 71,10}, { 39, 9}, { 83,10}, { 47, 9}, \ + { 99,10}, { 55,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 103,12}, { 31,11}, { 63, 7}, \ + { 1023, 8}, { 543, 9}, { 303,10}, { 167,11}, \ + { 95,10}, { 191,12}, { 63,11}, { 127,10}, \ + { 255,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 271,10}, { 543,11}, \ + { 287,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,12}, { 415,11}, \ + { 831,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 511,11}, { 1023,12}, { 543,11}, { 1087,10}, \ + { 2175,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 639,11}, { 1279,12}, { 671,11}, \ + { 1343,10}, { 2687,12}, { 703,11}, { 1407,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 799,11}, \ + { 1599,12}, { 831,13}, { 447,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1087,11}, { 2175,13}, { 
575,12}, { 1215,11}, \ + { 2431,13}, { 639,12}, { 1343,11}, { 2687,13}, \ + { 703,12}, { 1407,14}, { 383,13}, { 767,12}, \ + { 1599,13}, { 831,12}, { 1727,11}, { 3455,13}, \ + { 895,15}, { 255,14}, { 511,13}, { 1023,12}, \ + { 2047,13}, { 1087,12}, { 2175,13}, { 1215,12}, \ + { 2431,11}, { 4863,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,14}, { 895,13}, { 1919,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2815,12}, { 5631,13}, { 2943,12}, { 5887,15}, \ + { 767,14}, { 1535,13}, { 3199,14}, { 1663,13}, \ + { 3455,12}, { 6911,14}, { 1791,13}, { 3583,14}, \ + { 1919,13}, { 3839,16}, { 511,15}, { 1023,14}, \ + { 2175,13}, { 4479,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,13}, { 5887,12}, { 11775,15}, \ + { 1535,14}, { 3455,13}, { 6911,15}, { 1791,14}, \ + { 3839,13}, { 7679,16}, { 1023,15}, { 2047,14}, \ + { 4479,15}, { 2303,14}, { 4863,15}, { 2559,14}, \ + { 5247,15}, { 2815,14}, { 5887,13}, { 11775,16}, \ + { 1535,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4351,14}, { 8959,15}, { 4863,16}, { 2559,15}, \ + { 5887,14}, { 11775,16}, { 3071,15}, { 6911,16}, \ + { 3583,15}, { 7679,14}, { 15359,15}, { 7935,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 251 +#define MUL_FFT_THRESHOLD 4544 + +#define SQR_FFT_MODF_THRESHOLD 364 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 364, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ + { 11, 5}, { 23, 6}, { 12, 5}, { 25, 6}, \ + { 25, 7}, { 13, 6}, { 27, 7}, { 25, 8}, \ + { 13, 7}, { 28, 8}, { 15, 7}, { 31, 8}, \ + { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ + { 27, 9}, { 15, 8}, { 35, 9}, { 19, 8}, \ + { 41, 9}, { 23, 8}, { 47, 9}, { 27,10}, \ + { 15, 9}, { 39,10}, { 23, 9}, { 51,11}, \ + { 15,10}, { 31, 9}, { 67,10}, { 39, 9}, \ + { 79,10}, { 47,11}, { 31,10}, { 79,11}, \ + { 47,10}, { 95,12}, { 31,11}, { 63,10}, \ + { 135,11}, { 79,10}, { 159,11}, { 95, 7}, \ + { 1535, 8}, { 799, 7}, { 1599, 8}, { 831, 9}, \ + { 447,10}, { 239,11}, { 127,10}, { 255,11}, \ + { 143,10}, { 303,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 303,12}, { 159,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,12}, { 223,11}, { 447,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,12}, { 287,11}, \ + { 575,10}, { 1151,11}, { 607,12}, { 319,11}, \ + { 639,10}, { 1279,11}, { 671,12}, { 351,13}, \ + { 191,12}, { 383,11}, { 767,10}, { 1535,12}, \ + { 415,11}, { 831,12}, { 447,14}, { 127,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 543,11}, \ + { 1087,10}, { 2175,12}, { 575,11}, { 1151,12}, \ + { 607,13}, { 319,12}, { 639,11}, { 1279,12}, \ + { 671,11}, { 1343,12}, { 703,11}, { 1407,12}, \ + { 735,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 799,11}, { 1599,12}, { 831,13}, { 447,12}, \ + { 895,14}, { 255,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1087,11}, { 2175,13}, { 575,12}, \ + { 1151,11}, { 2303,12}, { 1215,11}, { 2431,13}, \ + { 639,12}, { 1343,13}, { 703,12}, { 1407,14}, \ + { 383,13}, { 767,12}, { 1599,11}, { 3199,13}, \ + { 831,12}, { 1727,11}, { 3455,13}, { 895,15}, \ + { 255,14}, { 511,13}, { 1023,12}, { 2047,13}, \ + { 1087,12}, { 
2175,13}, { 1151,12}, { 2303,13}, \ + { 1215,12}, { 2431,14}, { 639,13}, { 1343,12}, \ + { 2687,13}, { 1471,12}, { 2943,11}, { 5887,14}, \ + { 767,13}, { 1599,12}, { 3199,13}, { 1727,12}, \ + { 3455,11}, { 6911,14}, { 895,13}, { 1791,12}, \ + { 3583,13}, { 1919,12}, { 3839,15}, { 511,14}, \ + { 1023,13}, { 2175,14}, { 1151,13}, { 2431,12}, \ + { 4863,14}, { 1279,13}, { 2687,14}, { 1407,13}, \ + { 2943,12}, { 5887,11}, { 11775,15}, { 767,14}, \ + { 1535,13}, { 3199,14}, { 1663,13}, { 3455,12}, \ + { 6911,14}, { 1791,13}, { 3583,14}, { 1919,13}, \ + { 3839,16}, { 511,15}, { 1023,14}, { 2175,13}, \ + { 4351,12}, { 8703,13}, { 4479,12}, { 8959,14}, \ + { 2303,13}, { 4607,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2815,13}, { 5631,14}, { 2943,13}, \ + { 5887,12}, { 11775,15}, { 1535,14}, { 3455,13}, \ + { 6911,15}, { 1791,14}, { 3839,13}, { 7679,16}, \ + { 1023,15}, { 2047,14}, { 4351,13}, { 8703,14}, \ + { 4479,13}, { 8959,15}, { 2303,14}, { 4991,13}, \ + { 9983,15}, { 2559,14}, { 5119,15}, { 2815,14}, \ + { 5887,13}, { 11775,16}, { 1535,15}, { 3071,14}, \ + { 6143,15}, { 3327,14}, { 6911,15}, { 3839,14}, \ + { 7679,13}, { 15359,17}, { 1023,16}, { 2047,15}, \ + { 4095,14}, { 8191,15}, { 4351,14}, { 8959,15}, \ + { 4863,14}, { 9983,16}, { 2559,15}, { 5887,14}, \ + { 11775,16}, { 3071,15}, { 6911,16}, { 3583,15}, \ + { 7679,14}, { 15359,15}, { 7935,14}, { 15871,17}, \ + { 2047,16}, { 4095,15}, { 8959,16}, { 4607,15}, \ + { 9983,14}, { 19967,16}, { 5119,15}, { 10239,16}, \ + { 5631,15}, { 11775,17}, { 3071,16}, { 65536,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 275 +#define SQR_FFT_THRESHOLD 3264 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 23 +#define MULLO_MUL_N_THRESHOLD 8907 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 6440 + +#define DC_DIV_QR_THRESHOLD 52 +#define DC_DIVAPPR_Q_THRESHOLD 167 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 93 + +#define INV_MULMOD_BNM1_THRESHOLD 38 +#define INV_NEWTON_THRESHOLD 197 +#define INV_APPR_THRESHOLD 179 + +#define BINV_NEWTON_THRESHOLD 230 +#define REDC_1_TO_REDC_2_THRESHOLD 32 +#define REDC_2_TO_REDC_N_THRESHOLD 55 + +#define MU_DIV_QR_THRESHOLD 1387 +#define MU_DIVAPPR_Q_THRESHOLD 1387 +#define MUPI_DIV_QR_THRESHOLD 92 +#define MU_BDIV_QR_THRESHOLD 1142 +#define MU_BDIV_Q_THRESHOLD 1334 + +#define POWM_SEC_TABLE 1,22,194,434,452 + +#define GET_STR_DC_THRESHOLD 13 +#define GET_STR_PRECOMPUTE_THRESHOLD 20 +#define SET_STR_DC_THRESHOLD 438 +#define SET_STR_PRECOMPUTE_THRESHOLD 1254 + +#define FAC_DSC_THRESHOLD 189 +#define FAC_ODD_THRESHOLD 26 + +#define MATRIX22_STRASSEN_THRESHOLD 14 +#define HGCD2_DIV1_METHOD 3 /* 2.31% faster than 4 */ +#define HGCD_THRESHOLD 104 +#define HGCD_APPR_THRESHOLD 52 +#define HGCD_REDUCE_THRESHOLD 2681 +#define GCD_DC_THRESHOLD 465 +#define GCDEXT_DC_THRESHOLD 283 +#define JACOBI_BASE_METHOD 4 /* 5.81% faster than 1 */ + +/* Tuneup completed successfully, took 554602 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm new file mode 100644 index 0000000..799cdda --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/hamdist.asm @@ -0,0 +1,206 @@ +dnl AMD64 SSSE3/XOP mpn_hamdist -- hamming distance. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.51-2.0 y +C AMD bd2 1.50-1.9 y +C AMD bd3 ? +C AMD bd4 ? +C AMD zen n/a +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL n/a +C Intel SKL n/a +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we +C intend to support old systems. + +C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some +C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX. +C We fall back to the core2 code. +ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',` +MULFUNC_PROLOGUE(mpn_hamdist) +include_mpn(`x86_64/core2/hamdist.asm') +',` + +define(`up', `%rdi') +define(`vp', `%rsi') +define(`n', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_hamdist) + FUNC_ENTRY(3) + cmp $5, n + jl L(sma) + + lea L(cnsts)(%rip), %r9 + + xor R32(%r10), R32(%r10) + test $8, R8(vp) + jz L(ali) + mov (up), %r8 + xor (vp), %r8 + add $8, up + add $8, vp + dec n + popcnt %r8, %r10 +L(ali): + +ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)', + `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)') + movdqa OFF1`'(%r9), %xmm7 C nibble counts table + movdqa OFF2`'(%r9), %xmm6 C splat shift counts + movdqa OFF3`'(%r9), %xmm5 C masks + pxor %xmm4, %xmm4 + pxor %xmm8, %xmm8 C grand total count + + mov R32(n), R32(%rax) + and $6, R32(%rax) + lea -64(up,%rax,8), up + lea -64(vp,%rax,8), vp +ifdef(`PIC',` + movslq (%r9,%rax,2), %r11 + add %r9, %r11 + jmp *%r11 +',` + jmp *(%r9,%rax,4) +') + +L(0): add $64, up + add $64, vp + sub $2, n + + ALIGN(32) +L(top): lddqu (up), %xmm0 + pxor (vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(6): lddqu 16(up), %xmm0 + pxor 16(vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(4): lddqu 32(up), %xmm0 + pxor 32(vp), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, 
%xmm0, %xmm1 + pand %xmm5, %xmm0 + pand %xmm5, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0 + .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4 + paddb %xmm2, %xmm3 + paddb %xmm2, %xmm4 + paddq %xmm0, %xmm8 C sum to 2 x 64-bit counts +L(2): mov 48(up), %r8 + mov 56(up), %r9 + add $64, up + xor 48(vp), %r8 + xor 56(vp), %r9 + add $64, vp + popcnt %r8, %r8 + popcnt %r9, %r9 + add %r8, %r10 + add %r9, %r10 + sub $8, n + jg L(top) + + test $1, R8(n) + jz L(x) + mov (up), %r8 + xor (vp), %r8 + popcnt %r8, %r8 + add %r8, %r10 +L(x): .byte 0x8f,0xe9,0x78,0xd3,0xc4 C vphaddubq %xmm4, %xmm0 + paddq %xmm0, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + add %r10, %rax + FUNC_EXIT() + ret + +L(sma): mov (up), %r8 + xor (vp), %r8 + popcnt %r8, %rax + dec n + jz L(ed) +L(tp): mov 8(up), %r8 + add $8, up + xor 8(vp), %r8 + add $8, vp + popcnt %r8, %r8 + add %r8, %rax + dec n + jnz L(tp) +L(ed): FUNC_EXIT() + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(0), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(6), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) +') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm new file mode 100644 index 0000000..2fb097f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_1.asm @@ -0,0 +1,193 @@ +dnl AMD64 mpn_mul_1 optimised for AMD Bulldozer. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.65 +C AMD K10 3.30 3.68 +C AMD bull 4.04 4.29 +C AMD pile 4.33 +C AMD steam +C AMD excavator +C AMD bobcat 5.73 +C AMD jaguar 5.87 +C Intel P4 12.5 +C Intel core2 4.38 +C Intel NHM 4.28 +C Intel SBR 2.69 +C Intel IBR 2.55 +C Intel HWL 2.41 +C Intel BWL 2.49 +C Intel SKL 2.50 +C Intel atom 20.3 +C Intel SLM 7.8 +C VIA nano 4.25 + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Move loop code into feed-in blocks, to save insn for zeroing regs. 
+ +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +IFDOS(` define(`up', ``%rsi'') ') dnl +IFDOS(` define(`rp', ``%rcx'') ') dnl +IFDOS(` define(`v0', ``%r9'') ') dnl +IFDOS(` define(`r9', ``rdi'') ') dnl +IFDOS(` define(`n', ``%r8'') ') dnl +IFDOS(` define(`r8', ``rbx'') ') dnl + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1c) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +IFSTD(` add %r8, %rax ') +IFDOS(` add 64(%rsp), %rax ') C 40 + 3*8 (3 push insns) + adc $0, %rdx + jmp L(common) + +EPILOGUE() + + ALIGN(16) +PROLOGUE(mpn_mul_1) +IFDOS(``push %rsi '') +IFDOS(``push %rdi '') +IFDOS(``mov %rdx, %rsi '') + + mov (up), %rax C read first u limb early + push %rbx +IFSTD(` mov n_param, %r11 ') C move away n from rdx, mul uses it +IFDOS(` mov n, %r11 ') + mul v0 + +L(common): +IFSTD(` mov %r11, n ') + + and $3, R32(%r11) + lea -16(rp,n,8), rp + jz L(b0) + cmp $2, R32(%r11) + jb L(b1) + jz L(b2) + +L(b3): mov %rax, %r10 + mov %rdx, %r11 + mov 8(up), %rax + mul v0 + lea (up,n,8), up + not n + jmp L(L3) + +L(b0): mov %rax, %r9 + mov %rdx, %r10 + mov 8(up), %rax + lea (up,n,8), up + neg n + jmp L(L0) + +L(b1): mov %rax, %r8 + cmp $1, n + jz L(n1) + mov %rdx, %r9 + lea (up,n,8), up + neg n + mov %r8, 16(rp,n,8) + inc n + jmp L(L1) + +L(b2): mov %rax, %r11 + mov %rdx, %r8 + mov 8(up), %rax + lea (up,n,8), up + neg n + add $2, n + jns L(end) + + ALIGN(16) +L(top): mul v0 + mov %rdx, %r9 + add %rax, %r8 + adc $0, %r9 + mov %r8, 8(rp,n,8) + mov %r11, (rp,n,8) +L(L1): mov (up,n,8), %rax + mul v0 + add %rax, %r9 + mov %rdx, %r10 + mov 8(up,n,8), %rax + adc $0, %r10 +L(L0): mul v0 + add %rax, %r10 + mov %rdx, %r11 + mov 16(up,n,8), %rax + adc $0, %r11 + mul v0 + mov %r9, 16(rp,n,8) +L(L3): add %rax, %r11 + mov %r10, 24(rp,n,8) + mov %rdx, %r8 + adc $0, %r8 + add $4, n + mov -8(up,n,8), %rax + js L(top) + +L(end): mul v0 + add %rax, %r8 + adc $0, %rdx + mov %r11, (rp) +L(n1): mov %r8, 8(rp) + mov %rdx, %rax + + pop %rbx +IFDOS(``pop %rdi '') +IFDOS(``pop %rsi '') + ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm new file mode 100644 index 0000000..85fa7aa --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_2.asm @@ -0,0 +1,195 @@ +dnl AMD64 mpn_mul_2 optimised for AMD Bulldozer. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 6.78 +C AMD K10 6.78 +C AMD bd1 8.39 8.65 +C AMD bd2 8.47 +C AMD bd3 +C AMD bd4 +C AMD zen +C AMD bt1 12.1 +C AMD bt2 11.5 +C Intel P4 24.0 +C Intel PNR 8.14 +C Intel NHM 7.78 +C Intel SBR 6.34 +C Intel IBR 6.15 +C Intel HWL 6.04 +C Intel BWL 4.33 +C Intel SKL 4.41 +C Intel atom 39.5 +C Intel SLM 27.8 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + lea (up,n_param,8), up + lea (rp,n_param,8), rp + + mov n_param, n + mul v0 + neg n + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov (up,n,8), %rax + jmp L(lo0) + +L(b10): mov %rax, w2 + mov %rdx, w3 + mov (up,n,8), %rax + xor R32(w0), R32(w0) + mul v1 + add $-2, n + jmp L(lo2) + +L(bx1): test $2, R8(n) + jz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up,n,8), %rax + mul v1 + xor R32(w1), R32(w1) + inc n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov %rdx, w2 + mov (up,n,8), %rax + xor R32(w3), R32(w3) + dec n + jmp L(lo3) + + ALIGN(32) +L(top): mov -8(up,n,8), %rax + mul v1 + mov w2, -16(rp,n,8) +L(lo1): add %rax, w0 + mov w3, -8(rp,n,8) + adc %rdx, w1 + mov (up,n,8), %rax + mul v0 + mov $0, R32(w2) + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) + mov (up,n,8), %rax +L(lo0): mul v1 + add %rax, w1 + adc %rdx, w2 + mov 8(up,n,8), %rax + mul v0 + add %rax, w1 + mov w0, (rp,n,8) + mov $0, R32(w3) + mov 8(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(lo3): mul v1 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up,n,8), %rax + mov $0, R32(w0) + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + mov w1, 8(rp,n,8) +L(lo2): add %rax, w3 + adc %rdx, w0 + mov 24(up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + mov $0, R32(w1) + adc $0, R32(w1) + add $4, n + jnc L(top) + +L(end): mov -8(up), %rax + mul v1 + mov w2, -16(rp) + add %rax, w0 + mov w3, -8(rp) + adc %rdx, w1 + mov w0, (rp) + mov w1, %rax + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm new file mode 100644 index 0000000..e47ba58 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/mul_basecase.asm @@ -0,0 +1,416 @@ +dnl AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb mul_1 mul_2 mul_3 addmul_2 +C AMD K8,K9 +C AMD K10 +C AMD bull ~4.8 ~4.55 - ~4.3 +C AMD pile ~4.6 ~4.55 - ~4.55 +C AMD bobcat +C AMD jaguar +C Intel P4 +C Intel core +C Intel NHM +C Intel SBR +C Intel IBR +C Intel HWL +C Intel BWL +C Intel atom +C VIA nano + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Merge bull-specific mul_1, if it is not slower the TOOM22 range. +C Alternatively, we could tweak the present code (which was loopmixed for a +C different CPU). +C * Merge faster mul_2, such as the one in the same directory as this file. +C * Further micro-optimise. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. +define(`I',`$1') + + +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`un', `%rbx') + +define(`w0', `%r10') +define(`w1', `%r11') +define(`w2', `%r12') +define(`w3', `%r13') +define(`n', `%rbp') +define(`v0', `%r9') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + mov un_param, un C free up rdx + neg un + + mov (up), %rax C shared for mul_1 and mul_2 + lea (up,un_param,8), up C point at operand end + lea (rp,un_param,8), rp C point at rp[un-1] + + mov (vp), v0 C shared for mul_1 and mul_2 + mul v0 C shared for mul_1 and mul_2 + + test $1, R8(vn) + jz L(do_mul_2) + +L(do_mul_1): + test $1, R8(un) + jnz L(m1x1) + +L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ... + mov %rdx, w1 + mov 8(up,un,8), %rax + test $2, R8(un) + jnz L(m110) + +L(m100):lea 2(un), n C un = 4, 8, 12, ... + jmp L(m1l0) + +L(m110):lea (un), n C un = 2, 6, 10, ... + jmp L(m1l2) + +L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ... + mov %rdx, w0 + test $2, R8(un) + jz L(m111) + +L(m101):lea 3(un), n C un = 1, 5, 9, ... + test n, n + js L(m1l1) + mov %rax, -8(rp) + mov %rdx, (rp) + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(m111):lea 1(un), n C un = 3, 7, 11, ... 
+ mov 8(up,un,8), %rax + jmp L(m1l3) + + ALIGN(16) +L(m1tp):mov %rdx, w0 + add %rax, w1 +L(m1l1):mov -16(up,n,8), %rax + adc $0, w0 + mul v0 + add %rax, w0 + mov w1, -24(rp,n,8) + mov -8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(m1l0):mul v0 + mov w0, -16(rp,n,8) + add %rax, w1 + mov %rdx, w0 + mov (up,n,8), %rax + adc $0, w0 +L(m1l3):mul v0 + mov w1, -8(rp,n,8) + mov %rdx, w1 + add %rax, w0 + mov 8(up,n,8), %rax + adc $0, w1 +L(m1l2):mul v0 + mov w0, (rp,n,8) + add $4, n + jnc L(m1tp) + +L(m1ed):add %rax, w1 + adc $0, %rdx + mov w1, I(-8(rp),-24(rp,n,8)) + mov %rdx, I((rp),-16(rp,n,8)) + + dec R32(vn) + jz L(ret2) + + lea 8(vp), vp + lea 8(rp), rp + push %r12 + push %r13 + push %r14 + jmp L(do_addmul) + +L(do_mul_2): +define(`v1', `%r14') + push %r12 + push %r13 + push %r14 + + mov 8(vp), v1 + + test $1, R8(un) + jnz L(m2b1) + +L(m2b0):lea (un), n + mov %rax, w2 C 0 + mov (up,un,8), %rax + mov %rdx, w1 C 1 + mul v1 + mov %rax, w0 C 1 + mov w2, (rp,un,8) C 0 + mov 8(up,un,8), %rax + mov %rdx, w2 C 2 + jmp L(m2l0) + +L(m2b1):lea 1(un), n + mov %rax, w0 C 1 + mov %rdx, w3 C 2 + mov (up,un,8), %rax + mul v1 + mov w0, (rp,un,8) C 1 + mov %rdx, w0 C 3 + mov %rax, w2 C 0 + mov 8(up,un,8), %rax + jmp L(m2l1) + + ALIGN(32) +L(m2tp):add %rax, w2 C 0 + mov (up,n,8), %rax + adc $0, w0 C 1 +L(m2l1):mul v0 + add %rax, w2 C 0 + mov (up,n,8), %rax + mov %rdx, w1 C 1 + adc $0, w1 C 1 + mul v1 + add w3, w2 C 0 + adc $0, w1 C 1 + add %rax, w0 C 1 + mov w2, (rp,n,8) C 0 + mov 8(up,n,8), %rax + mov %rdx, w2 C 2 + adc $0, w2 C 2 +L(m2l0):mul v0 + add %rax, w0 C 1 + mov %rdx, w3 C 2 + adc $0, w3 C 2 + add w1, w0 C 1 + adc $0, w3 C 2 + mov 8(up,n,8), %rax + mul v1 + add $2, n + mov w0, -8(rp,n,8) C 1 + mov %rdx, w0 C 3 + jnc L(m2tp) + +L(m2ed):add %rax, w2 + adc $0, %rdx + add w3, w2 + adc $0, %rdx + mov w2, I((rp),(rp,n,8)) + mov %rdx, I(8(rp),8(rp,n,8)) + + add $-2, R32(vn) + jz L(ret5) + + lea 16(vp), vp + lea 16(rp), rp + + +L(do_addmul): + push %r15 + push vn C save vn in new stack slot +define(`vn', `(%rsp)') +define(`X0', `%r14') +define(`X1', `%r15') +define(`v1', `%r8') + +L(outer): + mov (vp), v0 + mov 8(vp), v1 + + mov (up,un,8), %rax + mul v0 + + test $1, R8(un) + jnz L(bx1) + +L(bx0): mov %rax, X1 + mov (up,un,8), %rax + mov %rdx, X0 + mul v1 + test $2, R8(un) + jnz L(b10) + +L(b00): lea (un), n C un = 4, 8, 12, ... + mov (rp,un,8), w3 + mov %rax, w0 + mov 8(up,un,8), %rax + mov %rdx, w1 + jmp L(lo0) + +L(b10): lea 2(un), n C un = 2, 6, 10, ... + mov (rp,un,8), w1 + mov %rdx, w3 + mov %rax, w2 + mov 8(up,un,8), %rax + jmp L(lo2) + +L(bx1): mov %rax, X0 + mov (up,un,8), %rax + mov %rdx, X1 + mul v1 + test $2, R8(un) + jz L(b11) + +L(b01): lea 1(un), n C un = 1, 5, 9, ... + mov (rp,un,8), w2 + mov %rdx, w0 + mov %rax, w3 + jmp L(lo1) + +L(b11): lea -1(un), n C un = 3, 7, 11, ... 
+ mov (rp,un,8), w0 + mov %rax, w1 + mov 8(up,un,8), %rax + mov %rdx, w2 + jmp L(lo3) + + ALIGN(32) +L(top): +L(lo2): mul v0 + add w1, X1 + mov X1, -16(rp,n,8) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov -8(up,n,8), %rax + mul v1 + mov -8(rp,n,8), w1 + mov %rdx, w0 + add w1, w2 + adc %rax, w3 + adc $0, w0 +L(lo1): mov (up,n,8), %rax + mul v0 + add w2, X0 + mov X0, -8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + mov (up,n,8), %rax + adc $0, X0 + mov (rp,n,8), w2 + mul v1 + add w2, w3 + adc %rax, w0 + mov 8(up,n,8), %rax + mov %rdx, w1 + adc $0, w1 +L(lo0): mul v0 + add w3, X1 + mov X1, (rp,n,8) + adc %rax, X0 + mov 8(up,n,8), %rax + mov %rdx, X1 + adc $0, X1 + mov 8(rp,n,8), w3 + mul v1 + add w3, w0 + adc %rax, w1 + mov 16(up,n,8), %rax + mov %rdx, w2 + adc $0, w2 +L(lo3): mul v0 + add w0, X0 + mov X0, 8(rp,n,8) + mov %rdx, X0 + adc %rax, X1 + adc $0, X0 + mov 16(up,n,8), %rax + mov 16(rp,n,8), w0 + mul v1 + mov %rdx, w3 + add w0, w1 + adc %rax, w2 + adc $0, w3 + mov 24(up,n,8), %rax + add $4, n + jnc L(top) + +L(end): mul v0 + add w1, X1 + mov X1, I(-16(rp),-16(rp,n,8)) + mov %rdx, X1 + adc %rax, X0 + adc $0, X1 + mov I(-8(up),-8(up,n,8)), %rax + mul v1 + mov I(-8(rp),-8(rp,n,8)), w1 + add w1, w2 + adc %rax, w3 + adc $0, %rdx + add w2, X0 + adc $0, X1 + mov X0, I(-8(rp),-8(rp,n,8)) + add w3, X1 + mov X1, I((rp),(rp,n,8)) + adc $0, %rdx + mov %rdx, I(8(rp),8(rp,n,8)) + + + addl $-2, vn + lea 16(vp), vp + lea 16(rp), rp + jnz L(outer) + + pop %rax C deallocate vn slot + pop %r15 +L(ret5):pop %r14 + pop %r13 + pop %r12 +L(ret2):pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm new file mode 100644 index 0000000..7b084f4 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm @@ -0,0 +1,191 @@ +dnl AMD64 SSSE3/XOP mpn_popcount -- population count. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 1.27 y +C AMD bd2 1.24 y +C AMD bd3 ? +C AMD bd4 1.22 +C AMD zen n/a +C AMD bobcat n/a +C AMD jaguar n/a +C Intel P4 n/a +C Intel CNR n/a +C Intel PNR n/a +C Intel NHM n/a +C Intel SBR n/a +C Intel IBR n/a +C Intel HWL n/a +C Intel BWL n/a +C Intel SKL n/a +C Intel atom n/a +C Intel SLM n/a +C VIA nano n/a + +C TODO +C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we +C intend to support old systems. 
+ +C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some +C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX. +C We fall back to the core2 code. +ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',` +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86_64/core2/popcount.asm') +',` + +define(`up', `%rdi') +define(`n', `%rsi') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_popcount) + FUNC_ENTRY(3) + lea L(cnsts)(%rip), %r9 + +ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)', + `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)') + movdqa OFF1`'(%r9), %xmm7 C nibble counts table + movdqa OFF2`'(%r9), %xmm6 C splat shift counts + movdqa OFF3`'(%r9), %xmm9 C masks + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 C 0-reg + pxor %xmm8, %xmm8 C grand total count + + xor R32(%rdx), R32(%rdx) + + mov R32(n), R32(%rax) + and $7, R32(%rax) +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + +L(1): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up),%rdx + add $8, up + dec n + jnz L(top) + mov %rdx, %rax + FUNC_EXIT() + ret + +L(2): add $-48, up + jmp L(e2) + +L(3): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-40, up + jmp L(e2) + +L(4): add $-32, up + jmp L(e4) + +L(5): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-24, up + jmp L(e4) + +L(6): add $-16, up + jmp L(e6) + +L(7): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx + add $-8, up + jmp L(e6) + + ALIGN(32) +L(top): lddqu (up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1, %xmm7, %xmm7, %xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e6): lddqu 16(up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2 + .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3 + paddb %xmm2, %xmm3 + paddb %xmm3, %xmm4 +L(e4): lddqu 32(up), %xmm0 + .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1 + pand %xmm9, %xmm0 + pand %xmm9, %xmm1 + .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0, %xmm7, %xmm7, %xmm2 + .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5 + .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4 + paddb %xmm2, %xmm4 +L(e2): popcnt 48(up), %r8 + popcnt 56(up), %r9 + add $64, up + paddq %xmm5, %xmm8 C sum to 2 x 64-bit counts + add %r8, %rdx + add %r9, %rdx + sub $8, n + jg L(top) + + .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5 + paddq %xmm5, %xmm8 + pshufd $14, %xmm8, %xmm0 + paddq %xmm8, %xmm0 + movd %xmm0, %rax + add %rdx, %rax + FUNC_EXIT() + ret +EPILOGUE() +DEF_OBJECT(L(cnsts),16,`JUMPTABSECT') + JMPENT( L(top), L(cnsts)) + JMPENT( L(1), L(cnsts)) + JMPENT( L(2), L(cnsts)) + JMPENT( L(3), L(cnsts)) + JMPENT( L(4), L(cnsts)) + JMPENT( L(5), L(cnsts)) + JMPENT( L(6), L(cnsts)) + JMPENT( L(7), L(cnsts)) + .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03 + .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte -4,-4,-4,-4,-4,-4,-4,-4 + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f + .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f +END_OBJECT(L(cnsts)) +') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm new file mode 100644 index 
0000000..e436034 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/sec_tabselect.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_sec_tabselect. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sec_tabselect) +include_mpn(`x86_64/fastsse/sec_tabselect.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm new file mode 100644 index 0000000..4ba673d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/bd1/sublsh1_n.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_sublsh1_n + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_sublsh1_n mpn_sublsh1_nc) +include_mpn(`x86_64/atom/sublsh1_n.asm') -- cgit v1.2.3
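
For reference, the mpn_mul_basecase body above computes {rp, un+vn} = {up, un} x {vp, vn} with un >= vn >= 1; the unrolled mul_1 / mul_2 / addmul_2 paths are a heavily scheduled schoolbook multiply. A minimal scalar C sketch of that operation follows, for illustration only: the limb type, function name, and use of unsigned __int128 (a GCC/Clang extension on x86-64) are assumptions for this sketch, not GMP's actual internals.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;

/* Reference schoolbook multiply: {rp, un+vn} = {up, un} * {vp, vn}, un >= vn >= 1.
   The assembly above produces the same result, but processes one or two
   v limbs per pass with unrolled inner loops.  */
static void
mul_basecase_ref (limb_t *rp, const limb_t *up, size_t un,
                  const limb_t *vp, size_t vn)
{
  /* First pass writes rp[0..un]: rp = up * vp[0].  */
  limb_t cy = 0;
  for (size_t i = 0; i < un; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * vp[0] + cy;
      rp[i] = (limb_t) p;
      cy = (limb_t) (p >> 64);
    }
  rp[un] = cy;

  /* Remaining passes accumulate: rp[j..j+un] += up * vp[j].  */
  for (size_t j = 1; j < vn; j++)
    {
      cy = 0;
      for (size_t i = 0; i < un; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * vp[j]
                                + rp[i + j] + cy;
          rp[i + j] = (limb_t) p;
          cy = (limb_t) (p >> 64);
        }
      rp[un + j] = cy;
    }
}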
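
The popcount.asm loop counts bits with the nibble-table scheme the header comment alludes to: vpshlb shifts every byte right by 4 (the rows of -4 bytes in L(cnsts) are the per-byte shift counts, the 0x0f rows the nibble mask), vpperm looks up per-nibble bit counts in the first two .byte rows of L(cnsts), the byte sums are folded into 64-bit totals with vphaddubq, and two of the eight limbs handled per iteration go through scalar popcnt instead. A scalar C sketch of the same nibble-table counting, for illustration only (names here are made up for the sketch):

#include <stddef.h>
#include <stdint.h>

/* Per-nibble bit counts -- the same values as the first 16 table bytes
   of L(cnsts) above.  */
static const uint8_t nibble_count[16] = {
  0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
};

/* Count set bits in n 64-bit limbs the way the SIMD loop does:
   look up the low and high nibble of every byte and sum the results.  */
static uint64_t
popcount_nibble_table (const uint64_t *up, size_t n)
{
  uint64_t total = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t w = up[i];
      for (int b = 0; b < 8; b++)
        {
          uint8_t byte = (uint8_t) (w >> (8 * b));
          total += nibble_count[byte & 0x0f];   /* low nibble  */
          total += nibble_count[byte >> 4];     /* high nibble */
        }
    }
  return total;
}

The table lookup avoids any per-bit work and maps directly onto byte-wide shuffles, which is how the XOP version reaches the roughly 1.2-1.3 cycles/limb quoted in the header table; the scalar sketch above is only meant to show what those shuffles compute.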