author    | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:36:36 +0200
committer | Thomas Voss <mail@thomasvoss.com> | 2024-06-21 23:42:26 +0200
commit    | a89a14ef5da44684a16b204e7a70460cc8c4922a
tree      | b23b4c6b155977909ef508fdae2f48d33d802813 /vendor/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm
parent    | 1db63fcedab0b288820d66e100b1877b1a5a8851
Basic constant folding implementation
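The subject refers to constant folding: evaluating expressions whose operands are all compile-time constants and replacing them with their results. The folding code itself is not visible in the diff below (the diffstat is limited to the vendored GMP file), so purely as a rough illustration of the technique, here is a minimal sketch in C; the `Expr` and `fold` names are hypothetical and not taken from this repository:

```c
/* Hypothetical expression node: an integer literal or a binary operation.
   Illustrative only; not code from this commit. */
typedef enum { LIT, ADD, MUL } Kind;

typedef struct Expr {
    Kind kind;
    long value;              /* meaningful when kind == LIT     */
    struct Expr *lhs, *rhs;  /* meaningful when kind is ADD/MUL */
} Expr;

/* Fold constants bottom-up: once both children of an operator node have
   been reduced to literals, replace the node with a literal result. */
static Expr *fold(Expr *e)
{
    if (e->kind == LIT)
        return e;
    e->lhs = fold(e->lhs);
    e->rhs = fold(e->rhs);
    if (e->lhs->kind == LIT && e->rhs->kind == LIT) {
        long l = e->lhs->value, r = e->rhs->value;
        e->value = (e->kind == ADD) ? l + r : l * r;
        e->kind = LIT;       /* children are no longer examined */
    }
    return e;
}
```

With such a pass, a tree for `2 * 3 + x` becomes `6 + x` in a single bottom-up walk, while subtrees containing variables are left alone.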
Diffstat (limited to 'vendor/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm')
-rw-r--r-- | vendor/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm | 176
1 file changed, 176 insertions, 0 deletions
```diff
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm
new file mode 100644
index 0000000..5ca128f
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/aorrlsh_n.asm
@@ -0,0 +1,176 @@
+dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V*2^k +- U.
+
+dnl  Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                    cycles/limb
+C AMD K8,K9      3.1    < 3.85 for lshift + add_n
+C AMD K10        3.1    < 3.85 for lshift + add_n
+C Intel P4      14.6    > 7.33 for lshift + add_n
+C Intel core2    3.87   > 3.27 for lshift + add_n
+C Intel NHM      4      > 3.75 for lshift + add_n
+C Intel SBR     (5.8)   > 3.46 for lshift + add_n
+C Intel atom    (7.75)  < 8.75 for lshift + add_n
+C VIA nano       4.7    < 6.25 for lshift + add_n
+
+C This was written quickly and not optimized at all.  Surely one could get
+C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
+C  1) Use indexing to save the 3 LEA
+C  2) Write reasonable feed-in code
+C  3) Be more clever about register usage
+C  4) Unroll more, handling CL negation, carry save/restore cost much now
+C  5) Reschedule
+
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`up',  `%rsi')
+define(`vp',  `%rdx')
+define(`n',   `%rcx')
+define(`cnt', `%r8')
+
+ifdef(`OPERATION_addlsh_n',`
+  define(ADCSBB, `adc')
+  define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+  define(ADCSBB, `sbb')
+  define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(func)
+        FUNC_ENTRY(4)
+IFDOS(` mov     56(%rsp), %r8d  ')
+        push    %r12
+        push    %r13
+        push    %r14
+        push    %rbp
+        push    %rbx
+
+        mov     n, %rax
+        xor     R32(%rbx), R32(%rbx)    C clear carry save register
+        mov     R32(%r8), R32(%rcx)     C shift count
+        xor     R32(%rbp), R32(%rbp)    C limb carry
+
+        mov     R32(%rax), R32(%r11)
+        and     $3, R32(%r11)
+        je      L(4)
+        sub     $1, R32(%r11)
+
+L(012): mov     (vp), %r8
+        mov     %r8, %r12
+        shl     R8(%rcx), %r8
+        or      %rbp, %r8
+        neg     R8(%rcx)
+        mov     %r12, %rbp
+        shr     R8(%rcx), %rbp
+        neg     R8(%rcx)
+        add     R32(%rbx), R32(%rbx)
+        ADCSBB  (up), %r8
+        mov     %r8, (rp)
+        sbb     R32(%rbx), R32(%rbx)
+        lea     8(up), up
+        lea     8(vp), vp
+        lea     8(rp), rp
+        sub     $1, R32(%r11)
+        jnc     L(012)
+
+L(4):   sub     $4, %rax
+        jc      L(end)
+
+        ALIGN(16)
+L(top): mov     (vp), %r8
+        mov     %r8, %r12
+        mov     8(vp), %r9
+        mov     %r9, %r13
+        mov     16(vp), %r10
+        mov     %r10, %r14
+        mov     24(vp), %r11
+
+        shl     R8(%rcx), %r8
+        shl     R8(%rcx), %r9
+        shl     R8(%rcx), %r10
+        or      %rbp, %r8
+        mov     %r11, %rbp
+        shl     R8(%rcx), %r11
+
+        neg     R8(%rcx)
+
+        shr     R8(%rcx), %r12
+        shr     R8(%rcx), %r13
+        shr     R8(%rcx), %r14
+        shr     R8(%rcx), %rbp          C used next iteration
+
+        or      %r12, %r9
+        or      %r13, %r10
+        or      %r14, %r11
+
+        neg     R8(%rcx)
+
+        add     R32(%rbx), R32(%rbx)    C restore carry flag
+
+        ADCSBB  (up), %r8
+        ADCSBB  8(up), %r9
+        ADCSBB  16(up), %r10
+        ADCSBB  24(up), %r11
+
+        mov     %r8, (rp)
+        mov     %r9, 8(rp)
+        mov     %r10, 16(rp)
+        mov     %r11, 24(rp)
+
+        sbb     R32(%rbx), R32(%rbx)    C save carry flag
+
+        lea     32(up), up
+        lea     32(vp), vp
+        lea     32(rp), rp
+
+        sub     $4, %rax
+        jnc     L(top)
+
+L(end): add     R32(%rbx), R32(%rbx)
+        ADCSBB  $0, %rbp
+        mov     %rbp, %rax
+        pop     %rbx
+        pop     %rbp
+        pop     %r14
+        pop     %r13
+        pop     %r12
+        FUNC_EXIT()
+        ret
+EPILOGUE()
```
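For reference, the routine's contract (per its header comment) is R = V*2^k +- U: mpn_addlsh_n computes {rp,n} = {up,n} + ({vp,n} << cnt) and mpn_rsblsh_n the corresponding reversed subtraction, with the shift folded into the same pass over the operands. That fusion is exactly what the cycles/limb table above compares against a separate lshift followed by add_n. Below is a minimal C sketch of the addlsh semantics, assuming 64-bit limbs and a shift count with 0 < cnt < 64 (so the 64-cnt shift is well defined); `ref_addlsh_n` is an illustrative name, not the GMP API:

```c
#include <stdint.h>
#include <stddef.h>

/* Illustrative reference for the addlsh case:
   {rp,n} = {up,n} + ({vp,n} << cnt), returning the carry out of the
   top limb.  Assumes 64-bit limbs and 0 < cnt < 64. */
static uint64_t ref_addlsh_n(uint64_t *rp, const uint64_t *up,
                             const uint64_t *vp, size_t n, unsigned cnt)
{
    uint64_t carry = 0;     /* carry limb from the addition              */
    uint64_t shift_in = 0;  /* bits shifted out of the previous vp limb  */

    for (size_t i = 0; i < n; i++) {
        uint64_t shifted = (vp[i] << cnt) | shift_in;  /* low part of vp[i]*2^cnt */
        shift_in = vp[i] >> (64 - cnt);                /* high bits, fed to next limb */

        uint64_t sum = up[i] + shifted;
        uint64_t c1 = sum < up[i];                     /* carry from first add */
        rp[i] = sum + carry;
        carry = c1 + (rp[i] < sum);                    /* carry from second add */
    }
    return shift_in + carry;  /* top limb of V*2^cnt plus the final carry */
}
```

In the assembly, the carry of the multi-limb addition cannot survive in the flags register across the shift instructions, so the loop materializes it into %rbx with `sbb R32(%rbx), R32(%rbx)` (0 or all-ones depending on the carry flag) and regenerates it with `add R32(%rbx), R32(%rbx)`; that is what the `save carry flag` and `restore carry flag` comments refer to, and why idea 4 in the file's to-do list calls the carry save/restore costly. The ADCSBB macro (adc vs. sbb) is the only difference between the addlsh_n and rsblsh_n builds of this file.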