| author    | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200 |
|-----------|-----------------------------------|---------------------------|
| committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200 |
| commit    | a89a14ef5da44684a16b204e7a70460cc8c4922a (patch) | |
| tree      | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86_64/atom | |
| parent    | 1db63fcedab0b288820d66e100b1877b1a5a8851 (diff) | |
Basic constant folding implementation
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86_64/atom')
21 files changed, 3234 insertions, 0 deletions
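
Note: the constant-folding pass named in the commit message lies outside this diffstat filter; only the vendored GMP 6.3.0 Atom-tuned assembly appears below. As a rough sketch of why an arbitrary-precision library is pulled in at all, a basic folder over an integer AST might lean on GMP like this (all names here are illustrative, not taken from this repository):

    #include <gmp.h>

    /* Illustrative AST: an integer literal or a binary operation. */
    typedef enum { LIT, ADD, MUL } Kind;
    typedef struct Expr {
        Kind kind;
        mpz_t lit;               /* valid when kind == LIT */
        struct Expr *lhs, *rhs;  /* valid for ADD/MUL */
    } Expr;

    /* Fold e bottom-up: a node whose operands are both literals is
       replaced by its exact value; mpz arithmetic never overflows. */
    static void fold(Expr *e)
    {
        if (e->kind == LIT)
            return;
        fold(e->lhs);
        fold(e->rhs);
        if (e->lhs->kind == LIT && e->rhs->kind == LIT) {
            mpz_init(e->lit);
            (e->kind == ADD ? mpz_add : mpz_mul)(e->lit, e->lhs->lit, e->rhs->lit);
            e->kind = LIT;       /* children left allocated in this sketch */
        }
    }

mpz_add and mpz_mul in turn bottom out in mpn primitives of the kind the files below provide Atom-tuned versions of.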
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm
new file mode 100644
index 0000000..c1dcdc4
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/addmul_2.asm
@@ -0,0 +1,186 @@
+dnl AMD64 mpn_addmul_2 optimised for Intel Atom.
+
+dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C AMD K8,K9
+C AMD K10
+C AMD bd1
+C AMD bd2
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel PNR
+C Intel NHM
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom 18.8 this
+C VIA nano
+
+C The loop of this code is the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`n_param', `%rdx') C r8
+define(`vp', `%rcx') C r9
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+ FUNC_ENTRY(4)
+ push %rbx
+ push %rbp
+
+ mov (up), %rax
+
+ mov (vp), v0
+ mov 8(vp), v1
+
+ mov n_param, n
+ mul v0
+
+ test $1, R8(n)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(n)
+ jnz L(b10)
+
+L(b00): mov %rax, w0
+ mov (up), %rax
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ lea -8(rp), rp
+ jmp L(lo0)
+
+L(b10): mov %rax, w2
+ mov (up), %rax
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ lea -16(up), up
+ lea -24(rp), rp
+ jmp L(lo2)
+
+L(bx1): test $2, R8(n)
+ jnz L(b11)
+
+L(b01): mov %rax, w3
+ mov %rdx, w0
+ mov (up), %rax
+ xor R32(w1), R32(w1)
+ lea 8(up), up
+ dec n
+ jmp L(lo1)
+
+L(b11): mov %rax, w1
+ mov (up), %rax
+ mov %rdx, w2
+ xor R32(w3), R32(w3)
+ lea -8(up), up
+ lea -16(rp), rp
+ jmp L(lo3)
+
+ ALIGN(16)
+L(top):
+L(lo1): mul v1
+ add w3, (rp)
+ mov $0, R32(w2)
+ adc %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ mul v0
+ add %rax, w0
+ mov (up), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+L(lo0): mul v1
+ add w0, 8(rp)
+ adc %rax, w1
+ mov 8(up), %rax
+ mov $0, R32(w3)
+ adc %rdx, w2
+ mul v0
+ add %rax, w1
+ mov 8(up), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+L(lo3): mul v1
+ add w1, 16(rp)
+ adc %rax, w2
+ mov 16(up), %rax
+ mov $0, R32(w0)
+ adc %rdx, w3
+ mul v0
+ add %rax, w2
+ mov 16(up), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+L(lo2): mul v1
+ add w2, 24(rp)
+ adc %rax, w3
+ mov 24(up), %rax
+ adc %rdx, w0
+ mov $0, R32(w1)
+ lea 32(rp), rp
+ mul v0
+ lea 32(up), up
+ add %rax, w3
+ adc %rdx, w0
+ mov -8(up), %rax
+ adc $0, R32(w1)
+ sub $4, n
+ ja L(top)
+
+L(end): mul v1
+ add w3, (rp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(rp)
+ mov w1, %rax
+ pop %rbp
+ pop %rbx
+ FUNC_EXIT()
+ ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm
new file mode 100644
index 0000000..f44de19
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/aorrlsh1_n.asm
@@ -0,0 +1,238 @@
+dnl AMD64 mpn_addlsh1_n, mpn_rsblsh1_n optimised for Intel Atom.
+dnl Used also for AMD bd1.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
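+C addlsh1: rp[] = up[] + (vp[] << 1)
+C rsblsh1: rp[] = (vp[] << 1) - up[]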
+
+include(`../config.m4')
+
+C TODO
+C * This code is slightly large at 433 bytes.
+C * sublsh1_n.asm and this file use the same basic pattern.
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C AMD bd1 2.3
+C AMD bobcat ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 4.875 (4.75 is probably possible)
+C VIA nano ?
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+define(`cy', `%r8')
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(ADDSUB, add)
+ define(ADCSBB, adc)
+ define(func_n, mpn_addlsh1_n)
+ define(func_nc, mpn_addlsh1_nc)')
+ifdef(`OPERATION_rsblsh1_n', `
+ define(ADDSUB, sub)
+ define(ADCSBB, sbb)
+ define(func_n, mpn_rsblsh1_n)
+ define(func_nc, mpn_rsblsh1_nc)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_addlsh1_nc mpn_rsblsh1_n mpn_rsblsh1_nc)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func_n)
+ FUNC_ENTRY(4)
+ push %rbp
+ xor R32(%rbp), R32(%rbp)
+L(ent): mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
+L(b1): mov (vp), %r8
+ add %r8, %r8
+ lea 8(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 8(up), up
+ lea 8(rp), rp
+ jmp L(b0)
+
+L(b2): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ lea 16(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 16(up), up
+ lea 16(rp), rp
+ jmp L(b0)
+
+L(b3): mov (vp), %r8
+ add %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ lea 24(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 24(up), up
+ lea 24(rp), rp
+
+L(b0): test $4, R8(n)
+ jz L(skp)
+ add R32(%rax), R32(%rax) C restore scy
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ lea 32(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ mov %r11, 24(rp)
+ lea 32(up), up
+ lea 32(rp), rp
+ sbb R32(%rbp), R32(%rbp) C save acy
+
+L(skp): cmp $8, n
+ jl L(rtn)
+
+ push %r12
+ push %r13
+ push %r14
+ push %rbx
+ lea -64(rp), rp
+ jmp L(x)
+
+ ALIGN(16)
+L(top): add R32(%rax), R32(%rax) C restore scy
+ lea 64(rp), rp
+ mov (vp), %r8
+ adc %r8, %r8
+ mov 8(vp), %r9
+ adc %r9, %r9
+ mov 16(vp), %r10
+ adc %r10, %r10
+ mov 24(vp), %r11
+ adc %r11, %r11
+ mov 32(vp), %r12
+ adc %r12, %r12
+ mov 40(vp), %r13
+ adc %r13, %r13
+ mov 48(vp), %r14
+ adc %r14, %r14
+ mov 56(vp), %rbx
+ adc %rbx, %rbx
+ lea 64(vp), vp
+ sbb R32(%rax), R32(%rax) C save scy
+ add R32(%rbp), R32(%rbp) C restore acy
+ ADCSBB (up), %r8
+ mov %r8, (rp)
+ ADCSBB 8(up), %r9
+ mov %r9, 8(rp)
+ ADCSBB 16(up), %r10
+ mov %r10, 16(rp)
+ ADCSBB 24(up), %r11
+ mov %r11, 24(rp)
+ ADCSBB 32(up), %r12
+ mov %r12, 32(rp)
+ ADCSBB 40(up), %r13
+ mov %r13, 40(rp)
+ ADCSBB 48(up), %r14
+ mov %r14, 48(rp)
+ ADCSBB 56(up), %rbx
+ mov %rbx, 56(rp)
+ sbb R32(%rbp), R32(%rbp) C save acy
+ lea 64(up), up
+L(x): sub $8, n
+ jge L(top)
+
+L(end): pop %rbx
+ pop %r14
+ pop %r13
+ pop %r12
+L(rtn):
+ifdef(`OPERATION_addlsh1_n',`
+ add R32(%rbp), R32(%rax)
+ neg R32(%rax)')
+ifdef(`OPERATION_rsblsh1_n',`
+ sub R32(%rax), R32(%rbp)
+ movslq R32(%rbp), %rax')
+
+ pop %rbp
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+PROLOGUE(func_nc)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+ push %rbp
+ neg %r8 C set CF
+ sbb R32(%rbp), R32(%rbp) C save acy
+ jmp L(ent)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm
new file mode 100644
index 0000000..02fb29d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/aorrlsh2_n.asm
@@ -0,0 +1,191 @@
+dnl AMD64 mpn_addlsh2_n -- rp[] = up[] + (vp[] << 2)
+dnl AMD64 mpn_rsblsh2_n -- rp[] = (vp[] << 2) - up[]
+dnl Optimised for Intel Atom.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2011, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 ?
+C AMD K10 ?
+C Intel P4 ?
+C Intel core2 ?
+C Intel NHM ?
+C Intel SBR ?
+C Intel atom 5.75
+C VIA nano ?
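+
+C Each limb of (vp[] << 2) is assembled as (vp[i] << LSH) | (vp[i-1] >> RSH),
+C i.e. an lea with scale M = 1 << LSH followed by a shr of RSH = 62 bits.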
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') + +define(`LSH', 2) +define(`RSH', 62) +define(M, eval(m4_lshift(1,LSH))) + +ifdef(`OPERATION_addlsh2_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_addlsh2_n) + define(func_nc, mpn_addlsh2_nc)') +ifdef(`OPERATION_rsblsh2_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsblsh2_n) + define(func_nc, mpn_rsblsh2_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) C we rely on rax = 0 at target + cmp $2, R32(%rax) + mov $0, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r9 + lea (%rax,%r9,M), %rbp + shr $RSH, %r9 + sub $1, n + lea -8(up), up + lea -8(rp), rp + jz L(cj1) + mov 8(vp), %r10 + lea (%r9,%r10,M), %r9 + shr $RSH, %r10 + mov 16(vp), %r11 + lea 24(vp), vp + mov (vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + jmp L(L1) + +L(b2): lea -32(rp), rp + mov (vp), %r8 + lea -32(up), up + lea (%rax,%r8,M), %rbx + shr $RSH, %r8 + mov 8(vp), %r9 + sub $2, n + jle L(end) + jmp L(top) + +L(b3): lea -24(up), up + mov (vp), %r11 + lea -24(rp), rp + mov 8(vp), %r8 + lea (%rax,%r11,M), %r10 + shr $RSH, %r11 + lea 8(vp), vp + lea (%r11,%r8,M), %rbx + add $1, n + jmp L(L3) + +L(b0): lea -16(up), up + mov (vp), %r10 + lea (%rax,%r10,M), %r9 + shr $RSH, %r10 + mov 8(vp), %r11 + lea -16(rp), rp + mov 16(vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + lea 16(vp), vp + jmp L(L0) + + ALIGN(16) +L(top): lea (%r8,%r9,M), %rbp + shr $RSH, %r9 + lea 32(up), up + mov 16(vp), %r10 + lea (%r9,%r10,M), %r9 + shr $RSH, %r10 + mov 24(vp), %r11 + lea 32(rp), rp + lea 32(vp), vp + mov (vp), %r8 + lea (%r10,%r11,M), %r10 + shr $RSH, %r11 + add R32(%rax), R32(%rax) + ADCSBB (up), %rbx + mov %rbx, (rp) +L(L1): ADCSBB 8(up), %rbp + mov %rbp, 8(rp) +L(L0): ADCSBB 16(up), %r9 + lea (%r11,%r8,M), %rbx + mov %r9, 16(rp) +L(L3): ADCSBB 24(up), %r10 + sbb R32(%rax), R32(%rax) +L(L2): shr $RSH, %r8 + mov 8(vp), %r9 + mov %r10, 24(rp) + sub $4, n + jg L(top) + +L(end): lea (%r8,%r9,M), %rbp + shr $RSH, %r9 + lea 32(up), up + lea 32(rp), rp + add R32(%rax), R32(%rax) + ADCSBB (up), %rbx + mov %rbx, (rp) +L(cj1): ADCSBB 8(up), %rbp + mov %rbp, 8(rp) + +ifdef(`OPERATION_addlsh2_n',` + mov R32(n), R32(%rax) C zero rax + adc %r9, %rax') +ifdef(`OPERATION_rsblsh2_n',` + sbb n, %r9 C subtract 0 + mov %r9, %rax') + + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm new file mode 100644 index 0000000..83b8df9 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/aors_n.asm @@ -0,0 +1,128 @@ +dnl X86-64 mpn_add_n, mpn_sub_n, optimised for Intel Atom. + +dnl Copyright 2011, 2017 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Marco Bodrato. Ported to 64-bit by +dnl Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2 +C AMD K10 2 +C AMD bull 2.34\2.63 +C AMD pile 2.27\2.52 +C AMD steam +C AMD excavator +C AMD bobcat 2.79 +C AMD jaguar 2.78 +C Intel P4 11 +C Intel core2 7.5 +C Intel NHM 8.5 +C Intel SBR 2.11 +C Intel IBR 2.07 +C Intel HWL 1.75 +C Intel BWL 1.51 +C Intel SKL 1.52 +C Intel atom 3 +C Intel SLM 4 +C VIA nano + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`vp', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc) + +ifdef(`OPERATION_add_n', ` + define(ADCSBB, adc) + define(func_n, mpn_add_n) + define(func_nc, mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(ADCSBB, sbb) + define(func_n, mpn_sub_n) + define(func_nc, mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + xor cy, cy C carry + +L(com): shr n C n >> 1 + jz L(1) C n == 1 + jc L(1m2) C n % 2 == 1 + +L(0m2): shr cy + mov (up), %r10 + lea 8(up), up + lea 8(vp), vp + lea -8(rp), rp + jmp L(mid) + +L(1): shr cy + mov (up), %r9 + jmp L(end) + +L(1m2): shr cy + mov (up), %r9 + + ALIGN(16) +L(top): ADCSBB (vp), %r9 + lea 16(up), up + mov -8(up), %r10 + lea 16(vp), vp + mov %r9, (rp) +L(mid): ADCSBB -8(vp), %r10 + lea 16(rp), rp + dec n + mov (up), %r9 + mov %r10, -8(rp) + jnz L(top) + +L(end): ADCSBB (vp), %r9 + mov $0, R32(%rax) + mov %r9, (rp) + adc R32(%rax), R32(%rax) + FUNC_EXIT() + ret +EPILOGUE() + +PROLOGUE(func_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), cy ') + jmp L(com) +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm new file mode 100644 index 0000000..7cbc085 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/aorsmul_1.asm @@ -0,0 +1,194 @@ +dnl AMD64 mpn_addmul_1/mpn_submul_1 optimised for Intel Atom. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 4.5 +C AMD K10 4.5 +C AMD bull 4.73 +C AMD pile 4.60 4.80 +C AMD steam +C AMD excavator +C AMD bobcat 5.48 +C AMD jaguar 5.61 +C Intel P4 16.6 +C Intel core2 5.09 +C Intel NHM 4.79 +C Intel SBR 3.88 +C Intel IBR 3.65 +C Intel HWL 3.53 +C Intel BWL 2.75 +C Intel SKL 2.76 +C Intel atom 19.4 +C Intel SLM 8 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%rbx') + +ifdef(`OPERATION_addmul_1',` + define(`ADDSUB', `add') + define(`func', `mpn_addmul_1') +') +ifdef(`OPERATION_submul_1',` + define(`ADDSUB', `sub') + define(`func', `mpn_submul_1') +') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func) + FUNC_ENTRY(4) + push %rbx + + mov (up), %rax + lea -8(up,n_param,8), up + lea -16(rp,n_param,8), rp + + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $1, R32(n) + sub n_param, n + mul v0 + mov %rax, %r11 + mov 8(up,n,8), %rax + mov %rdx, %r10 + mul v0 + mov %rax, %r8 + mov 16(up,n,8), %rax + jmp L(lo0) + +L(b10): mov $3, R32(n) + sub n_param, n + mul v0 + mov %rax, %r11 + mov -8(up,n,8), %rax + mov %rdx, %r10 + mul v0 + test n, n + jns L(cj2) + mov %rax, %r8 + mov (up,n,8), %rax + mov %rdx, %r9 + jmp L(lo2) + +L(bx1): test $2, R8(n_param) + jnz L(b11) + +L(b01): mov $2, R32(n) + sub n_param, n + mul v0 + test n, n + jns L(cj1) + mov %rax, %r8 + mov (up,n,8), %rax + mov %rdx, %r9 + mul v0 + mov %rax, %r11 + mov 8(up,n,8), %rax + mov %rdx, %r10 + jmp L(lo1) + +L(b11): xor R32(n), R32(n) + sub n_param, n + mul v0 + mov %rax, %r8 + mov 16(up,n,8), %rax + mov %rdx, %r9 + mul v0 + mov %rax, %r11 + mov 24(up,n,8), %rax + jmp L(lo3) + + ALIGN(16) +L(top): mul v0 + ADDSUB %r8, -16(rp,n,8) + mov %rax, %r8 + mov (up,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 +L(lo2): mul v0 + ADDSUB %r11, -8(rp,n,8) + mov %rax, %r11 + mov 8(up,n,8), %rax + adc %r10, %r8 + mov %rdx, %r10 + adc $0, %r9 +L(lo1): mul v0 + ADDSUB %r8, (rp,n,8) + mov %rax, %r8 + adc %r9, %r11 + mov 16(up,n,8), %rax + adc $0, %r10 +L(lo0): mov %rdx, %r9 + mul v0 + ADDSUB %r11, 8(rp,n,8) + mov %rax, %r11 + adc %r10, %r8 + mov 24(up,n,8), %rax + adc $0, %r9 +L(lo3): add $4, n + mov %rdx, %r10 + js L(top) + +L(end): mul v0 + ADDSUB %r8, -16(rp,n,8) + adc %r9, %r11 + adc $0, %r10 +L(cj2): ADDSUB %r11, -8(rp,n,8) + adc %r10, %rax + adc $0, %rdx +L(cj1): ADDSUB %rax, (rp,n,8) + mov $0, R32(%rax) + adc %rdx, %rax + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm new file mode 100644 index 0000000..fcb9a0f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/cnd_add_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_cnd_add_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_add_n) +include_mpn(`x86_64/coreisbr/cnd_add_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm new file mode 100644 index 0000000..9eee1c1 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/cnd_sub_n.asm @@ -0,0 +1,38 @@ +dnl X86-64 mpn_cnd_sub_n. + +dnl Copyright 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_cnd_sub_n) +include_mpn(`x86_64/coreisbr/cnd_sub_n.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/com.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/com.asm new file mode 100644 index 0000000..6b6460f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/com.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_com optimised for Intel Atom. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_com) +include_mpn(`x86_64/fastsse/com-palignr.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/copyd.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/copyd.asm new file mode 100644 index 0000000..e309279 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/copyd.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyd optimised for Intel Atom. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyd) +include_mpn(`x86_64/fastsse/copyd-palignr.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/copyi.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/copyi.asm new file mode 100644 index 0000000..00ec3c2 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/copyi.asm @@ -0,0 +1,37 @@ +dnl X86-64 mpn_copyi optimised for Intel Atom. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_copyi) +include_mpn(`x86_64/fastsse/copyi-palignr.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm new file mode 100644 index 0000000..d9ba5fe --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/dive_1.asm @@ -0,0 +1,37 @@ +dnl AMD64 mpn_divexact_1 -- mpn by limb exact division. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_divexact_1) +include_mpn(`x86_64/nano/dive_1.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h new file mode 100644 index 0000000..2cd90f6 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/gmp-mparam.h @@ -0,0 +1,222 @@ +/* Intel Atom/64 gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#define SHLD_SLOW 1 +#define SHRD_SLOW 1 + +/* 1600 MHz Diamondville (Atom 330) */ +/* FFT tuning limit = 50,646,641 */ +/* Generated by tuneup.c, 2019-10-16, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 16 + +#define DIV_1_VS_MUL_1_PERCENT 201 + +#define MUL_TOOM22_THRESHOLD 12 +#define MUL_TOOM33_THRESHOLD 74 +#define MUL_TOOM44_THRESHOLD 106 +#define MUL_TOOM6H_THRESHOLD 155 +#define MUL_TOOM8H_THRESHOLD 212 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 77 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 72 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 58 + +#define SQR_BASECASE_THRESHOLD 5 +#define SQR_TOOM2_THRESHOLD 22 +#define SQR_TOOM3_THRESHOLD 73 +#define SQR_TOOM4_THRESHOLD 130 +#define SQR_TOOM6_THRESHOLD 159 +#define SQR_TOOM8_THRESHOLD 236 + +#define MULMID_TOOM42_THRESHOLD 16 + +#define MULMOD_BNM1_THRESHOLD 9 +#define SQRMOD_BNM1_THRESHOLD 9 + +#define MUL_FFT_MODF_THRESHOLD 220 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 220, 5}, { 11, 6}, { 6, 5}, { 13, 6}, \ + { 13, 7}, { 7, 6}, { 15, 7}, { 8, 6}, \ + { 17, 7}, { 13, 8}, { 7, 7}, { 17, 8}, \ + { 9, 7}, { 19, 8}, { 11, 7}, { 23, 8}, \ + { 13, 9}, { 7, 8}, { 19, 9}, { 11, 8}, \ + { 25,10}, { 7, 9}, { 15, 8}, { 33, 9}, \ + { 19, 8}, { 39, 9}, { 23, 8}, { 47, 9}, \ + { 27,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 67,10}, \ + { 39, 9}, { 79,10}, { 47, 9}, { 95,11}, \ + { 31,10}, { 63, 9}, { 127, 8}, { 255,10}, \ + { 71, 9}, { 143, 8}, { 287,10}, { 79,11}, \ + { 47,10}, { 95, 9}, { 191,12}, { 31,11}, \ + { 63,10}, { 127, 9}, { 255, 8}, { 511,10}, \ + { 143, 9}, { 287,11}, { 79,10}, { 159, 9}, \ + { 319,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,10}, { 207,11}, { 111,10}, \ + { 223,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319,11}, { 175,10}, { 351,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,11}, { 223,13}, { 63,12}, { 127,11}, \ + { 255,10}, { 511,11}, { 287,10}, { 575,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 767,12}, { 223,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 511,12}, \ + { 287,11}, { 575,12}, { 319,11}, { 639,12}, \ + { 351,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 447,14}, { 127,13}, { 255,12}, { 575,13}, \ + { 319,12}, { 703,13}, { 383,12}, { 767,13}, \ + { 447,14}, { 255,13}, { 511,12}, { 1023,13}, \ + { 575,12}, { 1151,13}, { 703,14}, { 383,13}, \ + { 831,12}, { 1663,15}, { 255,14}, { 511,13}, \ + { 1087,12}, { 2175,13}, { 1151,14}, { 639,13}, \ + { 1407,12}, { 2815,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2175,14}, { 1151,13}, { 2431,12}, { 4863,14}, \ + { 1407,13}, { 2815,15}, { 767,14}, { 1791,16}, \ + { 511,15}, { 1023,14}, { 2431,13}, { 4863,15}, \ + { 1279,14}, { 2943,15}, { 1535,14}, { 16384,15}, \ + { 
32768,16}, { 65536,17}, { 131072,18}, { 262144,19}, \ + { 524288,20}, {1048576,21}, {2097152,22}, {4194304,23}, \ + {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 169 +#define MUL_FFT_THRESHOLD 2240 + +#define SQR_FFT_MODF_THRESHOLD 184 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 184, 5}, { 11, 6}, { 13, 7}, { 7, 6}, \ + { 15, 7}, { 8, 6}, { 17, 7}, { 13, 8}, \ + { 7, 7}, { 17, 8}, { 9, 7}, { 19, 8}, \ + { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ + { 19, 9}, { 11, 8}, { 25,10}, { 7, 9}, \ + { 15, 8}, { 33, 9}, { 19, 8}, { 39, 9}, \ + { 23,10}, { 15, 9}, { 39,10}, { 23, 9}, \ + { 47,11}, { 15,10}, { 31, 9}, { 63, 8}, \ + { 127, 7}, { 255,10}, { 39, 8}, { 159,10}, \ + { 47, 9}, { 95, 8}, { 191,11}, { 31,10}, \ + { 63, 9}, { 127, 8}, { 255, 7}, { 511,10}, \ + { 71, 9}, { 143, 8}, { 287, 7}, { 575, 9}, \ + { 159, 8}, { 319,11}, { 47,10}, { 95, 9}, \ + { 191, 8}, { 383,12}, { 31,11}, { 63,10}, \ + { 127, 9}, { 255, 8}, { 511,10}, { 143, 9}, \ + { 287, 8}, { 575,10}, { 159, 9}, { 319, 8}, \ + { 639,10}, { 175, 9}, { 351,11}, { 95,10}, \ + { 191, 9}, { 383,11}, { 111,10}, { 223, 9}, \ + { 447,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,11}, { 143,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,11}, { 175,10}, \ + { 351,12}, { 95,11}, { 191,10}, { 383, 9}, \ + { 767,11}, { 223,10}, { 447,13}, { 63,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 575,12}, { 159,11}, { 319,10}, { 639,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 767,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 511,12}, { 287,11}, { 575,12}, { 319,11}, \ + { 639,12}, { 351,13}, { 191,12}, { 383,11}, \ + { 767,12}, { 447,14}, { 127,13}, { 255,12}, \ + { 575,13}, { 319,12}, { 703,13}, { 383,12}, \ + { 767,13}, { 447,14}, { 255,13}, { 511,12}, \ + { 1023,13}, { 575,12}, { 1151,13}, { 703,14}, \ + { 383,13}, { 831,12}, { 1663,15}, { 255,14}, \ + { 511,13}, { 1151,14}, { 639,13}, { 1407,12}, \ + { 2815,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2047,14}, \ + { 1151,13}, { 2431,12}, { 4863,14}, { 1407,13}, \ + { 2815,15}, { 767,14}, { 1791,16}, { 511,15}, \ + { 1023,14}, { 2431,13}, { 4863,15}, { 1279,14}, \ + { 2943,15}, { 1535,14}, { 16384,15}, { 32768,16}, \ + { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ + {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 172 +#define SQR_FFT_THRESHOLD 1728 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 33 +#define MULLO_MUL_N_THRESHOLD 4392 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 85 +#define SQRLO_SQR_THRESHOLD 3176 + +#define DC_DIV_QR_THRESHOLD 34 +#define DC_DIVAPPR_Q_THRESHOLD 119 +#define DC_BDIV_QR_THRESHOLD 31 +#define DC_BDIV_Q_THRESHOLD 76 + +#define INV_MULMOD_BNM1_THRESHOLD 22 +#define INV_NEWTON_THRESHOLD 149 +#define INV_APPR_THRESHOLD 123 + +#define BINV_NEWTON_THRESHOLD 179 +#define REDC_1_TO_REDC_2_THRESHOLD 24 +#define REDC_2_TO_REDC_N_THRESHOLD 39 + +#define MU_DIV_QR_THRESHOLD 807 +#define MU_DIVAPPR_Q_THRESHOLD 807 +#define MUPI_DIV_QR_THRESHOLD 77 +#define MU_BDIV_QR_THRESHOLD 748 +#define MU_BDIV_Q_THRESHOLD 807 + +#define POWM_SEC_TABLE 1,22,114,326,1486 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 30 +#define SET_STR_DC_THRESHOLD 381 +#define SET_STR_PRECOMPUTE_THRESHOLD 1565 + +#define FAC_DSC_THRESHOLD 960 +#define FAC_ODD_THRESHOLD 0 /* always */ + +#define MATRIX22_STRASSEN_THRESHOLD 13 +#define HGCD2_DIV1_METHOD 3 /* 5.86% faster than 4 */ +#define HGCD_THRESHOLD 
88 +#define HGCD_APPR_THRESHOLD 88 +#define HGCD_REDUCE_THRESHOLD 1182 +#define GCD_DC_THRESHOLD 241 +#define GCDEXT_DC_THRESHOLD 192 +#define JACOBI_BASE_METHOD 3 /* 9.43% faster than 2 */ + +/* Tuneup completed successfully, took 193098 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/lshift.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/lshift.asm new file mode 100644 index 0000000..1b37d5d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/lshift.asm @@ -0,0 +1,123 @@ +dnl AMD64 mpn_lshift -- mpn left shift, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times +C larger. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshift) + FUNC_ENTRY(4) + lea -8(up,n,8), up + lea -8(rp,n,8), rp + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shl R8(%rcx), %r11 + neg R8(%rcx) + shr R8(%rcx), %rax + test n, n + jnz L(gt1) + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov -8(up), %r8 + mov %r8, %r10 + shr R8(%rcx), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(%rcx) + shr R8(%rcx), %rax + mov -8(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + neg R8(%rcx) + dec n + lea 8(rp), rp + lea -8(up), up + jz L(end) + + ALIGN(8) +L(top): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + neg R8(%rcx) + mov -8(up), %r8 + mov %r8, %r10 + mov %r9, -8(rp) + shr R8(%rcx), %r8 + lea -16(rp), rp +L(lo1): mov -16(up), %r9 + or %r11, %r8 + mov %r9, %r11 + shr R8(%rcx), %r9 + lea -16(up), up + neg R8(%rcx) + mov %r8, (rp) + dec n + jg L(top) + +L(end): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + mov %r9, -8(rp) + mov %r11, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm new file mode 100644 index 0000000..7385f8f --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/lshiftc.asm @@ -0,0 +1,127 @@ +dnl AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. 
+ +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4.5 c/l, but the code is 2.5 +C times larger. + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_lshiftc) + FUNC_ENTRY(4) + lea -8(up,n,8), up + lea -8(rp,n,8), rp + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shl R8(%rcx), %r11 + neg R8(%rcx) + shr R8(%rcx), %rax + test n, n + jnz L(gt1) + not %r11 + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov -8(up), %r8 + mov %r8, %r10 + shr R8(%rcx), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(%rcx) + shr R8(%rcx), %rax + mov -8(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + neg R8(%rcx) + lea 8(rp), rp + lea -8(up), up + jmp L(lo0) + +C ALIGN(16) +L(top): shl R8(%rcx), %r10 + or %r10, %r9 + shl R8(%rcx), %r11 + not %r9 + neg R8(%rcx) + mov -8(up), %r8 + lea -16(rp), rp + mov %r8, %r10 + shr R8(%rcx), %r8 + mov %r9, 8(rp) +L(lo1): or %r11, %r8 + mov -16(up), %r9 + mov %r9, %r11 + shr R8(%rcx), %r9 + lea -16(up), up + neg R8(%rcx) + not %r8 + mov %r8, (rp) +L(lo0): dec n + jg L(top) + +L(end): shl R8(%rcx), %r10 + or %r10, %r9 + not %r9 + shl R8(%rcx), %r11 + not %r11 + mov %r9, -8(rp) + mov %r11, -16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm new file mode 100644 index 0000000..a0dcf1e --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/mul_1.asm @@ -0,0 +1,147 @@ +dnl AMD64 mpn_mul_1 optimised for Intel Atom. + +dnl Copyright 2003-2005, 2007, 2008, 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 3.03 +C AMD K10 3.03 +C AMD bull 4.74 +C AMD pile 4.56 +C AMD steam +C AMD excavator +C AMD bobcat 5.56 6.04 +C AMD jaguar 5.55 5.84 +C Intel P4 13.05 +C Intel core2 4.03 +C Intel NHM 3.80 +C Intel SBR 2.75 +C Intel IBR 2.69 +C Intel HWL 2.50 +C Intel BWL 2.55 +C Intel SKL 2.57 +C Intel atom 17.3 +C Intel SLM 14.7 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`v0', `%rcx') C r9 + +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_1) + FUNC_ENTRY(4) + xor %r8, %r8 +L(com): mov (up), %rax + lea -16(up,n_param,8), up + lea -8(rp,n_param,8), rp + test $1, R8(n_param) + jnz L(bx1) + +L(bx0): mov %r8, %r9 + test $2, R8(n_param) + jnz L(b10) + +L(b00): mov $2, R32(n) + sub n_param, n + jmp L(lo0) + +L(bx1): test $2, R8(n_param) + jnz L(b11) + +L(b01): mov $3, R32(n) + sub n_param, n + mul v0 + cmp $2, n + jnz L(lo1) + jmp L(cj1) + +L(b11): mov $1, R32(n) + sub n_param, n + jmp L(lo3) + +L(b10): xor R32(n), R32(n) + sub n_param, n + jmp L(lo2) + +L(top): mul v0 + mov %r9, -24(rp,n,8) +L(lo1): xor %r9d, %r9d + add %rax, %r8 + mov (up,n,8), %rax + adc %rdx, %r9 + mov %r8, -16(rp,n,8) +L(lo0): xor %r8d, %r8d + mul v0 + add %rax, %r9 + mov 8(up,n,8), %rax + adc %rdx, %r8 + mov %r9, -8(rp,n,8) +L(lo3): xor %r9d, %r9d + mul v0 + add %rax, %r8 + mov 16(up,n,8), %rax + adc %rdx, %r9 + mov %r8, (rp,n,8) +L(lo2): xor %r8d, %r8d + mul v0 + add %rax, %r9 + mov 24(up,n,8), %rax + adc %rdx, %r8 + add $4, n + js L(top) + +L(end): mul v0 + mov %r9, -8(rp) +L(cj1): add %rax, %r8 + mov $0, R32(%rax) + adc %rdx, %rax + mov %r8, (rp) + FUNC_EXIT() + ret +EPILOGUE() + +PROLOGUE(mpn_mul_1c) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + jmp L(com) +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm new file mode 100644 index 0000000..4bc22cd --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/mul_2.asm @@ -0,0 +1,190 @@ +dnl AMD64 mpn_mul_2 optimised for Intel Atom. + +dnl Copyright 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C AMD K8,K9 5.78 +C AMD K10 5.78 +C AMD bull 9.10 +C AMD pile 9.17 +C AMD steam +C AMD excavator +C AMD bobcat 11.3 +C AMD jaguar 10.9 +C Intel P4 24.6 +C Intel core2 8.06 +C Intel NHM 7.65 +C Intel SBR 6.28 +C Intel IBR 6.10 +C Intel HWL 6.09 +C Intel BWL 4.73 +C Intel SKL 4.77 +C Intel atom 35.3 +C Intel SLM 25.6 +C VIA nano + +C The loop of this code is the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`n_param', `%rdx') C r8 +define(`vp', `%rcx') C r9 + +define(`v0', `%r8') +define(`v1', `%r9') +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') +define(`n', `%r11') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_2) + FUNC_ENTRY(4) + push %rbx + push %rbp + + mov (up), %rax + + mov (vp), v0 + mov 8(vp), v1 + + mov n_param, n + mul v0 + + test $1, R8(n) + jnz L(bx1) + +L(bx0): test $2, R8(n) + jnz L(b10) + +L(b00): mov %rax, w0 + mov (up), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + lea -8(rp), rp + jmp L(lo0) + +L(b10): mov %rax, w2 + mov (up), %rax + mov %rdx, w3 + xor R32(w0), R32(w0) + lea -16(up), up + lea -24(rp), rp + jmp L(lo2) + +L(bx1): test $2, R8(n) + jnz L(b11) + +L(b01): mov %rax, w3 + mov %rdx, w0 + mov (up), %rax + xor R32(w1), R32(w1) + lea 8(up), up + dec n + jmp L(lo1) + +L(b11): mov %rax, w1 + mov (up), %rax + mov %rdx, w2 + xor R32(w3), R32(w3) + lea -8(up), up + lea -16(rp), rp + jmp L(lo3) + + ALIGN(16) +L(top): +L(lo1): mul v1 + add %rax, w0 + mov (up), %rax + mov $0, R32(w2) + mov w3, (rp) + adc %rdx, w1 + mul v0 + add %rax, w0 + mov (up), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(lo0): mul v1 + add %rax, w1 + mov 8(up), %rax + mov w0, 8(rp) + adc %rdx, w2 + mul v0 + add %rax, w1 + mov 8(up), %rax + adc %rdx, w2 + mov $0, R32(w3) + adc $0, R32(w3) +L(lo3): mul v1 + add %rax, w2 + mov 16(up), %rax + mov w1, 16(rp) + mov $0, R32(w0) + adc %rdx, w3 + mul v0 + add %rax, w2 + mov 16(up), %rax + adc %rdx, w3 +L(lo2): mov $0, R32(w1) + mov w2, 24(rp) + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov 24(up), %rax + lea 32(up), up + adc %rdx, w0 + mul v0 + lea 32(rp), rp + add %rax, w3 + adc %rdx, w0 + mov -8(up), %rax + adc $0, R32(w1) + sub $4, n + ja L(top) + +L(end): mul v1 + mov w3, (rp) + add %rax, w0 + adc %rdx, w1 + mov w0, 8(rp) + mov w1, %rax + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/popcount.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/popcount.asm new file mode 100644 index 0000000..fb14dd3 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/popcount.asm @@ -0,0 +1,35 @@ +dnl x86-64 mpn_popcount. + +dnl Copyright 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_popcount) +include_mpn(`x86/pentium4/sse2/popcount.asm') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm new file mode 100644 index 0000000..62b9a84 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/redc_1.asm @@ -0,0 +1,579 @@ +dnl X86-64 mpn_redc_1 optimised for Intel Atom. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? +C AMD pile ? +C AMD steam ? +C AMD bobcat 5.0 +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Micro-optimise, none performed thus far. +C * Consider inlining mpn_add_n. +C * Single basecases out before the pushes. +C * Make lead-in code for the inner loops be more similar. + +C When playing with pointers, set this to $2 to fall back to conservative +C indexing in wind-down code. 
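+C Example: with the default define(`I',`$1'), I(-24(up),-24(up,i,8)) expands
+C to -24(up); with define(`I',`$2') it expands to the generic -24(up,i,8).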
+define(`I',`$1') + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r14') +define(`j', `%r15') +define(`mp', `%r12') +define(`q0', `%r13') +define(`w0', `%rbp') +define(`w1', `%r9') +define(`w2', `%r10') +define(`w3', `%r11') + +C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15 + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +define(`ALIGNx', `ALIGN(16)') + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), q0 + mov n, j C outer loop induction var + lea (mp_param,n,8), mp + lea (up,n,8), up + neg n + imul u0inv, q0 C first iteration q0 + + test $1, R8(n) + jz L(bx0) + +L(bx1): test $2, R8(n) + jz L(b3) + +L(b1): cmp $-1, R32(n) + jz L(n1) + +L(otp1):lea 1(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %rbp + mov 8(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + add (up,n,8), %rbp + mov %rax, %rbp + adc %r9, %rbx + mov 24(mp,n,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %r11 + adc %r10, %rbp + mov 32(mp,n,8), %rax + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e1) + + ALIGNx +L(tp1): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 +L(e1): add $4, i + mov %rdx, %r10 + js L(tp1) + +L(ed1): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp1) + jmp L(cj) + +L(b3): cmp $-3, R32(n) + jz L(n3) + +L(otp3):lea 3(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %rbp + mov 8(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + add (up,n,8), %rbp + mov %rax, %rbp + mov 24(mp,n,8), %rax + adc %r9, %rbx + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %r11 + mov 32(mp,n,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + imul u0inv, %rbx C next q limb + jmp L(e3) + + ALIGNx +L(tp3): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 +L(e3): mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp3) + +L(ed3): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C 
up++ + dec j + jnz L(otp3) +C jmp L(cj) + +L(cj): +IFSTD(` lea (up,n,8), up C param 2: up + lea (up,n,8), %rdx C param 3: up - n + neg R32(n) ') C param 4: n + +IFDOS(` lea (up,n,8), %rdx C param 2: up + lea (%rdx,n,8), %r8 C param 3: up - n + neg R32(n) + mov n, %r9 C param 4: n + mov rp, %rcx ') C param 1: rp + +IFSTD(` sub $8, %rsp ') +IFDOS(` sub $40, %rsp ') + ASSERT(nz, `test $15, %rsp') + CALL( mpn_add_n) +IFSTD(` add $8, %rsp ') +IFDOS(` add $40, %rsp ') + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +L(bx0): test $2, R8(n) + jnz L(b2) + +L(b0): cmp $-4, R32(n) + jz L(n4) + +L(otp0):lea 4(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %r11 + mov 8(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + add (up,n,8), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov 24(mp,n,8), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + imul u0inv, %rbx C next q limb + jmp L(e0) + + ALIGNx +L(tp0): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 +L(e0): mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 + mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp0) + +L(ed0): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp0) + jmp L(cj) + +L(b2): cmp $-2, R32(n) + jz L(n2) + +L(otp2):lea 2(n), i + mov (mp,n,8), %rax + mul q0 + mov %rax, %r11 + mov 8(mp,n,8), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov 16(mp,n,8), %rax + mov %rdx, %r9 + mul q0 + add (up,n,8), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov 24(mp,n,8), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add 8(up,n,8), %rbx + mov %rbx, 8(up,n,8) + mov %rax, %rbp + mov 32(mp,n,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + imul u0inv, %rbx C next q limb + jmp L(e2) + + ALIGNx +L(tp2): mul q0 + add %rbp, -24(up,i,8) + mov %rax, %rbp + mov (mp,i,8), %rax + adc %r9, %r11 + mov %rdx, %r9 + adc $0, %r10 + mul q0 + add %r11, -16(up,i,8) + mov %rax, %r11 + mov 8(mp,i,8), %rax + adc %r10, %rbp + mov %rdx, %r10 + adc $0, %r9 + mul q0 + add %rbp, -8(up,i,8) + mov %rax, %rbp + adc %r9, %r11 + mov 16(mp,i,8), %rax + adc $0, %r10 + mov %rdx, %r9 +L(e2): mul q0 + add %r11, (up,i,8) + mov %rax, %r11 + adc %r10, %rbp + mov 24(mp,i,8), %rax + adc $0, %r9 + add $4, i + mov %rdx, %r10 + js L(tp2) + +L(ed2): mul q0 + add %rbp, I(-24(up),-24(up,i,8)) + adc %r9, %r11 + adc $0, %r10 + add %r11, I(-16(up),-16(up,i,8)) + adc %r10, %rax + adc $0, %rdx + add %rax, I(-8(up),-8(up,i,8)) + adc $0, %rdx + mov %rdx, (up,n,8) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + lea 8(up), up C up++ + dec j + jnz L(otp2) + jmp L(cj) + +L(n1): mov (mp_param), %rax + mul q0 + add -8(up), %rax + adc (up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n2): mov (mp_param), %rax + mov -16(up), %rbp + mul 
q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov -8(up), %r10 + mul q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + add %r9, %r10 + adc $0, %r11 + mov %r10, q0 + imul u0inv, q0 C next q0 + mov -16(mp), %rax + mul q0 + add %rax, %r10 + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + mov (up), %r14 + mul q0 + add %rax, %r14 + adc $0, %rdx + add %r9, %r14 + adc $0, %rdx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 8(up), %rdx + mov %r14, (rp) + mov %rdx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + ALIGNx +L(n3): mov -24(mp), %rax + mov -24(up), %r10 + mul q0 + add %rax, %r10 + mov -16(mp), %rax + mov %rdx, %r11 + adc $0, %r11 + mov -16(up), %rbp + mul q0 + add %rax, %rbp + mov %rdx, %r9 + adc $0, %r9 + mov -8(mp), %rax + add %r11, %rbp + mov -8(up), %r10 + adc $0, %r9 + mul q0 + mov %rbp, q0 + imul u0inv, q0 C next q0 + add %rax, %r10 + mov %rdx, %r11 + adc $0, %r11 + mov %rbp, -16(up) + add %r9, %r10 + adc $0, %r11 + mov %r10, -8(up) + mov %r11, -24(up) C up[0] + lea 8(up), up C up++ + dec j + jnz L(n3) + + mov -48(up), %rdx + mov -40(up), %rbx + xor R32(%rax), R32(%rax) + add %rbp, %rdx + adc %r10, %rbx + adc -8(up), %r11 + mov %rdx, (rp) + mov %rbx, 8(rp) + mov %r11, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + +L(n4): mov -32(mp), %rax + mul q0 + mov %rax, %r11 + mov -24(mp), %rax + mov %rdx, %r10 + mul q0 + mov %rax, %rbx + mov -16(mp), %rax + mov %rdx, %r9 + mul q0 + add -32(up), %r11 + mov %rax, %r11 + adc %r10, %rbx + mov -8(mp), %rax + adc $0, %r9 + mov %rdx, %r10 + mul q0 + add -24(up), %rbx + mov %rbx, -24(up) + adc %r9, %r11 + adc $0, %r10 + imul u0inv, %rbx C next q limb + add %r11, -16(up) + adc %r10, %rax + adc $0, %rdx + add %rax, -8(up) + adc $0, %rdx + mov %rdx, -32(up) C up[0] + mov %rbx, q0 C previously computed q limb -> q0 + dec j + lea 8(up), up C up++ + jnz L(n4) + jmp L(cj) +EPILOGUE() +ASM_END() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm new file mode 100644 index 0000000..6f5f638 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/rsh1aors_n.asm @@ -0,0 +1,287 @@ +dnl x86-64 mpn_rsh1add_n/mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * Schedule loop less. It is now almost surely overscheduled, resulting in +C large feed-in and wind-down code. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? 
+C Intel core2 ? +C Intel NMH ? +C Intel SBR ? +C Intel atom 5.25 +C VIA nano ? + +C INPUT PARAMETERS +define(`rp',`%rdi') +define(`up',`%rsi') +define(`vp',`%rdx') +define(`n',`%rcx') + +ifdef(`OPERATION_rsh1add_n', ` + define(ADDSUB, add) + define(ADCSBB, adc) + define(func_n, mpn_rsh1add_n) + define(func_nc, mpn_rsh1add_nc)') +ifdef(`OPERATION_rsh1sub_n', ` + define(ADDSUB, sub) + define(ADCSBB, sbb) + define(func_n, mpn_rsh1sub_n) + define(func_nc, mpn_rsh1sub_nc)') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(func_n) + FUNC_ENTRY(4) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov (up), %r15 + ADDSUB (vp), %r15 + sbb R32(%rbx), R32(%rbx) + xor R32(%rax), R32(%rax) + shr %r15 + adc R32(%rax), R32(%rax) C return value + + mov R32(n), R32(%rbp) + and $3, R32(%rbp) + jz L(b0) + cmp $2, R32(%rbp) + jae L(b23) + +L(b1): dec n + jnz L(gt1) + shl $63, %rbx + add %rbx, %r15 + mov %r15, (rp) + jmp L(cj1) +L(gt1): lea 24(up), up + lea 24(vp), vp + mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + lea 24(rp), rp + mov (up), %r11 + ADCSBB -16(vp), %r9 + ADCSBB -8(vp), %r10 + mov %r15, %r12 + ADCSBB (vp), %r11 + mov %r9, %r13 + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + mov %r10, %r14 + shl $63, %r11 + shl $63, %r10 + shl $63, %r9 + or %r9, %r12 + shr %r13 + mov 8(up), %r8 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + sub $4, n + jz L(cj5) +L(gt5): mov 16(up), %r9 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov 32(up), %r11 + jmp L(lo1) + +L(b23): jnz L(b3) + mov 8(up), %r8 + sub $2, n + jnz L(gt2) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + mov %r8, %r12 + jmp L(cj2) +L(gt2): mov 16(up), %r9 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov 32(up), %r11 + ADCSBB 16(vp), %r9 + lea 32(up), up + ADCSBB 24(vp), %r10 + mov %r9, %r13 + ADCSBB 32(vp), %r11 + mov %r8, %r12 + jmp L(lo2) + +L(b3): lea 40(up), up + lea 8(vp), vp + mov %r15, %r14 + add R32(%rbx), R32(%rbx) + mov -32(up), %r11 + ADCSBB 0(vp), %r11 + lea 8(rp), rp + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + shl $63, %r11 + mov -24(up), %r8 + shr %r15 + or %r11, %r14 + sub $3, n + jnz L(gt3) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + jmp L(cj3) +L(gt3): mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov (up), %r11 + ADCSBB 16(vp), %r9 + ADCSBB 24(vp), %r10 + mov %r8, %r12 + jmp L(lo3) + +L(b0): lea 48(up), up + lea 16(vp), vp + add R32(%rbx), R32(%rbx) + mov -40(up), %r10 + lea 16(rp), rp + mov -32(up), %r11 + ADCSBB -8(vp), %r10 + mov %r15, %r13 + ADCSBB (vp), %r11 + sbb R32(%rbx), R32(%rbx) + mov %r11, %r15 + mov %r10, %r14 + shl $63, %r11 + shl $63, %r10 + mov -24(up), %r8 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + sub $4, n + jnz L(gt4) + add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + jmp L(cj4) +L(gt4): mov -16(up), %r9 + add R32(%rbx), R32(%rbx) + mov -8(up), %r10 + ADCSBB 8(vp), %r8 + mov %r15, %rbp + mov (up), %r11 + ADCSBB 16(vp), %r9 + jmp L(lo0) + + ALIGN(8) +L(top): mov 16(up), %r9 + shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + add R32(%rbx), R32(%rbx) + mov 24(up), %r10 + mov %rbp, (rp) + ADCSBB 8(vp), %r8 + mov %r15, %rbp + lea 32(rp), rp + mov 32(up), %r11 +L(lo1): ADCSBB 16(vp), %r9 + lea 32(up), up + mov %r12, -24(rp) +L(lo0): ADCSBB 24(vp), %r10 + mov %r8, %r12 + mov %r13, -16(rp) +L(lo3): ADCSBB 32(vp), %r11 + mov 
%r9, %r13 + mov %r14, -8(rp) +L(lo2): sbb R32(%rbx), R32(%rbx) + shl $63, %r8 + mov %r11, %r15 + shr %r12 + mov %r10, %r14 + shl $63, %r9 + lea 32(vp), vp + shl $63, %r10 + or %r8, %rbp + shl $63, %r11 + or %r9, %r12 + shr %r13 + mov 8(up), %r8 + sub $4, n + jg L(top) + +L(end): shr %r14 + or %r10, %r13 + shr %r15 + or %r11, %r14 + mov %rbp, (rp) + lea 32(rp), rp +L(cj5): add R32(%rbx), R32(%rbx) + ADCSBB 8(vp), %r8 + mov %r12, -24(rp) +L(cj4): mov %r13, -16(rp) +L(cj3): mov %r8, %r12 + mov %r14, -8(rp) +L(cj2): sbb R32(%rbx), R32(%rbx) + shl $63, %r8 + shr %r12 + or %r8, %r15 + shl $63, %rbx + add %rbx, %r12 + mov %r15, (rp) + mov %r12, 8(rp) +L(cj1): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/rshift.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/rshift.asm new file mode 100644 index 0000000..29c027d --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/rshift.asm @@ -0,0 +1,121 @@ +dnl AMD64 mpn_rshift -- mpn right shift, optimised for Atom. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 4.5 +C VIA nano ? + +C TODO +C * Consider using 4-way unrolling. We reach 4 c/l, but the code is 2.5 times +C larger. 
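As a cross-check on the shift logic, the whole routine can be modelled in a few lines of portable C. This is a minimal reference sketch, not GMP code: the limb typedef and the name ref_rshift are placeholders, 64-bit limbs are assumed, and cnt must lie in 1..63.

/* Reference model of mpn_rshift: shift {up,n} right by cnt bits into
   {rp,n} and return the bits shifted out, left-justified in a limb. */
typedef unsigned long long limb;   /* stands in for mp_limb_t (assumed 64-bit) */

limb ref_rshift(limb *rp, const limb *up, long n, unsigned cnt)
{
    limb out = up[0] << (64 - cnt);       /* bits falling off the low end */
    for (long i = 0; i + 1 < n; i++)
        rp[i] = (up[i] >> cnt) | (up[i + 1] << (64 - cnt));
    rp[n - 1] = up[n - 1] >> cnt;         /* zero fill at the top */
    return out;
}

One detail of the asm worth noting: variable x86-64 shifts must take their count in %cl, and shift counts are masked to 6 bits, so rather than keeping both cnt and 64-cnt live, the loop flips the sign of cnt in place (neg R8(cnt)) to alternate between the shr-by-cnt and shl-by-(64-cnt) halves of each limb.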
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`n', `%rdx') +define(`cnt', `%rcx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_rshift) + FUNC_ENTRY(4) + shr R32(n) + mov (up), %rax + jnc L(evn) + + mov %rax, %r11 + shr R8(cnt), %r11 + neg R8(cnt) + shl R8(cnt), %rax + test n, n + jnz L(gt1) + mov %r11, (rp) + FUNC_EXIT() + ret + +L(gt1): mov 8(up), %r8 + mov %r8, %r10 + shl R8(cnt), %r8 + jmp L(lo1) + +L(evn): mov %rax, %r10 + neg R8(cnt) + shl R8(cnt), %rax + mov 8(up), %r9 + mov %r9, %r11 + shl R8(cnt), %r9 + neg R8(cnt) + dec n + lea -8(rp), rp + lea 8(up), up + jz L(end) + + ALIGN(8) +L(top): shr R8(cnt), %r10 + or %r10, %r9 + shr R8(cnt), %r11 + neg R8(cnt) + mov 8(up), %r8 + mov %r8, %r10 + mov %r9, 8(rp) + shl R8(cnt), %r8 + lea 16(rp), rp +L(lo1): mov 16(up), %r9 + or %r11, %r8 + mov %r9, %r11 + shl R8(cnt), %r9 + lea 16(up), up + neg R8(cnt) + mov %r8, (rp) + dec n + jg L(top) + +L(end): shr R8(cnt), %r10 + or %r10, %r9 + shr R8(cnt), %r11 + mov %r9, 8(rp) + mov %r11, 16(rp) + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm new file mode 100644 index 0000000..1306acd --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/atom/sublsh1_n.asm @@ -0,0 +1,242 @@ +dnl AMD64 mpn_sublsh1_n optimised for Intel Atom. +dnl Used also for AMD bd1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C TODO +C * This code is slightly large at 501 bytes. +C * aorrlsh1_n.asm and this file use the same basic pattern. + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bd1 2.3 +C AMD bobcat ? +C Intel P4 ? +C Intel core2 ? +C Intel NHM ? +C Intel SBR ? +C Intel atom 5 (4.875 is probably possible) +C VIA nano ? 
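The shared pattern this file and aorrlsh1_n.asm use keeps two carry chains live at once: the carry of the on-the-fly 2*vp computation, saved as a 0/-1 mask in %rax by sbb, and the borrow of the subtraction, saved in %rbp the same way. A minimal C model of the operation, under the same caveats as before (placeholder names, 64-bit limbs):

/* Reference model of mpn_sublsh1_n: {rp,n} = {up,n} - 2*{vp,n}.
   Returns the outgoing carry word in 0..2, the bit shifted out of
   vp's top limb plus the final borrow, matching the epilogue's
   add R32(%rbp),R32(%rax); neg R32(%rax) on the two 0/-1 masks. */
typedef unsigned long long limb;

limb ref_sublsh1_n(limb *rp, const limb *up, const limb *vp, long n)
{
    limb scy = 0;                      /* shift carry: top bit of prev vp limb */
    limb acy = 0;                      /* subtraction borrow                   */
    for (long i = 0; i < n; i++) {
        limb sh = (vp[i] << 1) | scy;  /* next limb of 2*vp */
        scy = vp[i] >> 63;
        limb d  = up[i] - sh;
        limb b1 = up[i] < sh;          /* the two borrows are mutually   */
        rp[i]   = d - acy;             /* exclusive, so OR combines them */
        limb b2 = d < acy;
        acy = b1 | b2;
    }
    return scy + acy;
}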
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp', `%rdx') +define(`n', `%rcx') +define(`cy', `%r8') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_sublsh1_n) + FUNC_ENTRY(4) + push %rbp + push %r15 + xor R32(%rbp), R32(%rbp) +L(ent): mov R32(n), R32(%rax) + and $3, R32(%rax) + jz L(b0) + cmp $2, R32(%rax) + jz L(b2) + jg L(b3) + +L(b1): mov (vp), %r8 + add %r8, %r8 + lea 8(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 8(up), up + lea 8(rp), rp + jmp L(b0) + +L(b2): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + lea 16(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 16(up), up + lea 16(rp), rp + jmp L(b0) + +L(b3): mov (vp), %r8 + add %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + lea 24(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + mov 16(up), %r15 + sbb %r10, %r15 + mov %r15, 16(rp) + sbb R32(%rbp), R32(%rbp) C save acy + lea 24(up), up + lea 24(rp), rp + +L(b0): test $4, R8(n) + jz L(skp) + add R32(%rax), R32(%rax) C restore scy + mov (vp), %r8 + adc %r8, %r8 + mov 8(vp), %r9 + adc %r9, %r9 + mov 16(vp), %r10 + adc %r10, %r10 + mov 24(vp), %r11 + adc %r11, %r11 + lea 32(vp), vp + sbb R32(%rax), R32(%rax) C save scy + add R32(%rbp), R32(%rbp) C restore acy + mov (up), %r15 + sbb %r8, %r15 + mov %r15, (rp) + mov 8(up), %r15 + sbb %r9, %r15 + mov %r15, 8(rp) + mov 16(up), %r15 + sbb %r10, %r15 + mov %r15, 16(rp) + mov 24(up), %r15 + sbb %r11, %r15 + mov %r15, 24(rp) + lea 32(up), up + lea 32(rp), rp + sbb R32(%rbp), R32(%rbp) C save acy + +L(skp): cmp $8, n + jl L(rtn) + + push %r12 + push %r13 + push %r14 + push %rbx + lea -64(rp), rp + jmp L(x) + + ALIGN(16) +L(top): mov (vp), %r8 + add R32(%rax), R32(%rax) + lea 64(vp), vp + adc %r8, %r8 + mov -56(vp), %r9 + adc %r9, %r9 + mov -48(vp), %r10 + adc %r10, %r10 + mov -40(vp), %r11 + adc %r11, %r11 + mov -32(vp), %r12 + adc %r12, %r12 + mov -24(vp), %r13 + adc %r13, %r13 + mov -16(vp), %r14 + adc %r14, %r14 + mov -8(vp), %r15 + adc %r15, %r15 + sbb R32(%rax), R32(%rax) + add R32(%rbp), R32(%rbp) + mov (up), %rbp + lea 64(rp), rp + mov 8(up), %rbx + sbb %r8, %rbp + mov 32(up), %r8 + mov %rbp, (rp) + sbb %r9, %rbx + mov 16(up), %rbp + mov %rbx, 8(rp) + sbb %r10, %rbp + mov 24(up), %rbx + mov %rbp, 16(rp) + sbb %r11, %rbx + mov %rbx, 24(rp) + sbb %r12, %r8 + mov 40(up), %r9 + mov %r8, 32(rp) + sbb %r13, %r9 + mov 48(up), %rbp + mov %r9, 40(rp) + sbb %r14, %rbp + mov 56(up), %rbx + mov %rbp, 48(rp) + sbb %r15, %rbx + lea 64(up), up + mov %rbx, 56(rp) + sbb R32(%rbp), R32(%rbp) +L(x): sub $8, n + jge L(top) + +L(end): pop %rbx + pop %r14 + pop %r13 + pop %r12 +L(rtn): + add R32(%rbp), R32(%rax) + neg R32(%rax) + + pop %r15 + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() +PROLOGUE(mpn_sublsh1_nc) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + push %r15 + neg %r8 C set CF + sbb R32(%rbp), R32(%rbp) C save acy + jmp L(ent) +EPILOGUE() |
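Stepping back to mpn_redc_1 earlier in this diff: each outer iteration (the L(otp0..3) blocks, or L(n1..4) for small n) performs one word of Montgomery reduction. It computes q0 = up[0] * u0inv mod B, adds q0*mp to n limbs of up so the low limb cancels, parks the resulting carry limb at up[0], and advances up; the final mpn_add_n call then folds the n parked carries into the n high limbs. Below is a hedged C model of that structure, with ref_addmul_1 and ref_add_n as stand-ins for the corresponding mpn routines; all names and the 64-bit limb type are ours, and the caller still owes a conditional subtraction of the modulus when a carry comes back.

typedef unsigned long long limb;

/* {rp,n} += {ap,n} * b; returns the carry limb. */
static limb ref_addmul_1(limb *rp, const limb *ap, long n, limb b)
{
    limb cy = 0;
    for (long i = 0; i < n; i++) {
        unsigned __int128 t = (unsigned __int128)ap[i] * b + rp[i] + cy;
        rp[i] = (limb)t;
        cy    = (limb)(t >> 64);
    }
    return cy;
}

/* {rp,n} = {ap,n} + {bp,n}; returns the carry bit. */
static limb ref_add_n(limb *rp, const limb *ap, const limb *bp, long n)
{
    limb cy = 0;
    for (long i = 0; i < n; i++) {
        limb s = ap[i] + cy;
        limb c = s < cy;
        rp[i]  = s + bp[i];
        cy     = c | (rp[i] < s);
    }
    return cy;
}

/* Word-by-word Montgomery reduction: given {up,2n} and odd m = {mp,n}
   with u0inv = -1/mp[0] mod B (B = 2^64), leave {rp,n} congruent to
   {up,2n} * B^-n mod m and return the top carry. */
limb ref_redc_1(limb *rp, limb *up, const limb *mp, long n, limb u0inv)
{
    for (long j = 0; j < n; j++) {
        limb q0 = up[0] * u0inv;           /* makes up[0] + q0*mp[0] == 0 mod B */
        limb cy = ref_addmul_1(up, mp, n, q0);
        up[0] = cy;                        /* park the carry, as the asm does   */
        up++;                              /* lea 8(up), up                     */
    }
    return ref_add_n(rp, up, up - n, n);   /* high half + parked carries        */
}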