From a89a14ef5da44684a16b204e7a70460cc8c4922a Mon Sep 17 00:00:00 2001 From: Thomas Voss Date: Fri, 21 Jun 2024 23:36:36 +0200 Subject: Basic constant folding implementation --- vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm | 153 ++++ vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm | 195 +++++ vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm | 217 ++++++ vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm | 179 +++++ vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm | 249 +++++++ vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h | 237 ++++++ vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm | 469 ++++++++++++ vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm | 436 +++++++++++ vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm | 559 ++++++++++++++ vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm | 591 +++++++++++++++ vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm | 807 +++++++++++++++++++++ 11 files changed, 4092 insertions(+) create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm create mode 100644 vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm (limited to 'vendor/gmp-6.3.0/mpn/x86_64/k8') diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm new file mode 100644 index 0000000..3e1898b --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/addaddmul_1msb0.asm @@ -0,0 +1,153 @@ +dnl AMD64 mpn_addaddmul_1msb0, R = Au + Bv, u,v < 2^63. 
+ +dnl Copyright 2008, 2021 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.167 +C AMD K10 2.167 +C Intel P4 12.0 +C Intel core2 4.0 +C Intel corei ? +C Intel atom ? +C VIA nano ? + +C TODO +C * Perhaps handle various n mod 3 sizes better. The code now is too large. 
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`ap', `%rsi')
+define(`bp_param', `%rdx')
+define(`n', `%rcx')
+define(`u0', `%r8')
+define(`v0', `%r9')
+
+
+define(`bp', `%rbp')   C working copy of bp_param, since every mul overwrites %rdx
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(mpn_addaddmul_1msb0)
+        FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+        push    %rbp                    C bp lives in %rbp; restored at L(ret)
+
+        lea     (ap,n,8), ap            C ap, bp, rp now point just past limb n-1
+        lea     (bp_param,n,8), bp
+        lea     (rp,n,8), rp
+        neg     n                       C negative index, counts up towards zero
+
+        mov     (ap,n,8), %rax
+        mul     %r8                     C rdx:rax = ap[0] * u0
+        mov     %rax, %r11              C r11 = low accumulator
+        mov     (bp,n,8), %rax          C rax = bp[0], multiplied by v0 at L(top) or L(end)
+        mov     %rdx, %r10              C r10 = high accumulator
+        add     $3, n
+        jns     L(end)                  C at most 3 limbs: skip the 3-way unrolled loop
+
+        push    %r13                    C third accumulator, only used inside the loop
+
+        ALIGN(16)
+L(top): mul     %r9                     C pending B limb * v0
+        add     %rax, %r11
+        mov     -16(ap,n,8), %rax
+        adc     %rdx, %r10
+        mov     %r11, -24(rp,n,8)       C first result limb of this iteration
+        mul     %r8                     C A limb * u0
+        add     %rax, %r10
+        mov     -16(bp,n,8), %rax
+        mov     $0, R32(%r13)           C mov rather than xor keeps the carry of the add alive
+        adc     %rdx, %r13
+        mul     %r9
+        add     %rax, %r10
+        mov     -8(ap,n,8), %rax
+        adc     %rdx, %r13
+        mov     %r10, -16(rp,n,8)       C second result limb
+        mul     %r8
+        add     %rax, %r13
+        mov     -8(bp,n,8), %rax
+        mov     $0, R32(%r11)
+        adc     %rdx, %r11
+        mul     %r9
+        add     %rax, %r13
+        adc     %rdx, %r11
+        mov     (ap,n,8), %rax
+        mul     %r8
+        add     %rax, %r11
+        mov     %r13, -8(rp,n,8)        C third result limb
+        mov     (bp,n,8), %rax
+        mov     $0, R32(%r10)
+        adc     %rdx, %r10
+        add     $3, n
+        js      L(top)                  C loop while the index is still negative
+
+        pop     %r13                    C the wind-down code only uses r10 and r11
+
+L(end): mul     %r9                     C finish the pending B limb * v0
+        add     %rax, %r11
+        adc     %rdx, %r10
+        cmp     $1, R32(n)              C n is 0, 1 or 2 here, assuming a limb count of at least 1
+        ja      L(two)                  C n = 2: the pending limb was the last one
+        mov     -16(ap,n,8), %rax
+        mov     %r11, -24(rp,n,8)
+        mov     %r10, %r11
+        jz      L(one)                  C n = 1: flags are still those of the cmp, mov leaves them alone
+
+L(nul): mul     %r8                     C n = 0: two more limbs follow the one just stored
+        add     %rax, %r10
+        mov     -16(bp), %rax
+        mov     $0, R32(%r11)
+        adc     %rdx, %r11
+        mul     %r9
+        add     %rax, %r10
+        mov     -8(ap), %rax
+        adc     %rdx, %r11
+        mov     %r10, -16(rp)
+L(one): mul     %r8                     C last limb: A limb * u0, then B limb * v0
+        add     %rax, %r11
+        mov     -8(bp), %rax
+        mov     $0, R32(%r10)
+        adc     %rdx, %r10
+        mul     %r9
+        add     %rax, %r11
+        adc     %rdx, %r10
+
+L(two): mov     %r11, -8(rp)            C store the top result limb
+        mov     %r10, %rax              C return value: the limb carried out above rp[n-1]
+L(ret): pop     %rbp
+        FUNC_EXIT()
+        ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
new file mode 100644
index 0000000..78bcba1
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/addmul_2.asm
@@ -0,0 +1,195 @@
+dnl AMD64 mpn_addmul_2 -- Multiply an
n-limb vector with a 2-limb vector and +dnl add the result to a third limb vector. + +dnl Copyright 2008, 2011, 2012, 2016 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cfg cycles/limb am1+am1 +C AMD K8,K9 2.375 +C AMD K10 2.375 +C AMD bull 5.2 <- 4.6-4.75 bad +C AMD pile 4.96 <- 4.6-4.75 bad +C AMD steam ? +C AMD excavator ? +C AMD bobcat 5.75 5.0 bad +C AMD jaguar 5.9 5.2-5.4 bad +C Intel P4 15-16 +C Intel core2 4.5 4.25-4.5 bad +C Intel NHM 4.33 4.55 bad +C Intel SBR 3.4 2.93 3.24 bad +C Intel IBR 3.35 2.6 2.95 bad +C Intel HWL 3.3 2.15 2.3 bad +C Intel BWL 2.33 2.33 1.65 bad +C Intel SKL 2.37 2.21 1.64 bad +C Intel atom 20 18.7 +C Intel SLM 8 8.5 +C VIA nano 4.4 + +C This code is the result of running a code generation and optimization tool +C suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Tune feed-in and wind-down code. 
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param',`%rdx')
+define(`vp', `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')   C shares %rcx with vp; vp is only read before w1 is first written
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n', `%r11')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(mpn_addmul_2)
+        FUNC_ENTRY(4)
+        mov     n_param, n              C n_param is %rdx, which mul overwrites
+        push    %rbx
+        push    %rbp
+
+        mov     0(vp), v0               C load both multiplier limbs before %rcx is reused as w1
+        mov     8(vp), v1
+
+        mov     R32(n_param), R32(%rbx) C rbx = n, reduced mod 4 below to pick the loop entry
+        mov     (up), %rax
+        lea     -8(up,n_param,8), up    C up -> up[n-1]
+        lea     -8(rp,n_param,8), rp    C rp -> rp[n-1]
+        mul     v0                      C rdx:rax = up[0] * v0
+        neg     n                       C negative index, counts up towards zero
+        and     $3, R32(%rbx)
+        jz      L(b0)                   C n mod 4 = 0
+        cmp     $2, R32(%rbx)
+        jc      L(b1)                   C n mod 4 = 1
+        jz      L(b2)                   C n mod 4 = 2, else fall into b3
+
+L(b3):  mov     %rax, w1
+        mov     %rdx, w2
+        xor     R32(w3), R32(w3)
+        mov     8(up,n,8), %rax         C rax = up[0] again, for the v1 product
+        dec     n                       C bias n so that the lo3 slot addresses limb 0
+        jmp     L(lo3)
+
+L(b2):  mov     %rax, w2
+        mov     8(up,n,8), %rax
+        mov     %rdx, w3
+        xor     R32(w0), R32(w0)
+        add     $-2, n                  C bias n so that the lo2 slot addresses limb 0
+        jmp     L(lo2)
+
+L(b1):  mov     %rax, w3
+        mov     8(up,n,8), %rax
+        mov     %rdx, w0
+        xor     R32(w1), R32(w1)
+        inc     n                       C bias n so that the lo1 slot addresses limb 0
+        jmp     L(lo1)
+
+L(b0):  mov     $0, R32(w3)
+        mov     %rax, w0
+        mov     8(up,n,8), %rax
+        mov     %rdx, w1
+        xor     R32(w2), R32(w2)
+        jmp     L(lo0)                  C n needs no bias for the lo0 slot
+
+        ALIGN(32)
+L(top): mov     $0, R32(w1)
+        mul     v0                      C rax was loaded at the end of the previous iteration
+        add     %rax, w3
+        mov     (up,n,8), %rax          C reload the same limb for the v1 product
+        adc     %rdx, w0
+        adc     $0, R32(w1)
+L(lo1): mul     v1
+        add     w3, (rp,n,8)            C accumulate finished limb into rp
+        mov     $0, R32(w3)             C mov, not xor: the carry of the add is consumed below
+        adc     %rax, w0
+        mov     $0, R32(w2)
+        mov     8(up,n,8), %rax
+        adc     %rdx, w1
+        mul     v0
+        add     %rax, w0
+        mov     8(up,n,8), %rax         C mul clobbered rax; reload for the v1 product
+        adc     %rdx, w1
+        adc     $0, R32(w2)
+L(lo0): mul     v1
+        add     w0, 8(rp,n,8)
+        adc     %rax, w1
+        adc     %rdx, w2
+        mov     16(up,n,8), %rax
+        mul     v0
+        add     %rax, w1
+        adc     %rdx, w2
+        adc     $0, R32(w3)
+        mov     16(up,n,8), %rax
+L(lo3): mul     v1
+        add     w1, 16(rp,n,8)
+        adc     %rax, w2
+        adc     %rdx, w3
+        xor     R32(w0), R32(w0)        C safe here: the carry chain above is complete
+        mov     24(up,n,8), %rax
+        mul     v0
+        add     %rax, w2
+        mov     24(up,n,8), %rax
+        adc     %rdx, w3
+        adc     $0, R32(w0)
+L(lo2): mul     v1
+        add     w2, 24(rp,n,8)
+        adc     %rax, w3
+        adc     %rdx, w0
+        mov     32(up,n,8), %rax        C first limb of the next group of four
+        add     $4, n
+        js      L(top)                  C loop while the index is still negative
+
+L(end): xor     R32(w1), R32(w1)        C wind-down: last up limb times v0 and v1
+        mul     v0
+        add     %rax, w3
+        mov     (up), %rax              C up points at the last limb, reload it for v1
+        adc     %rdx, w0
+        adc     R32(w1), R32(w1)        C w1 = carry (w1 was zero)
+        mul     v1
+        add     w3, (rp)                C rp[n-1] += w3
+        adc     %rax, w0
+        adc     %rdx, w1
+        mov     w0, 8(rp)               C rp[n] is stored, not accumulated
+        mov     w1, %rax                C return value: the most significant limb
+
+        pop     %rbp
+        pop     %rbx
+        FUNC_EXIT()
+        ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
new file mode 100644
index 0000000..ff3a184
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
@@ -0,0 +1,217 @@
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
+
+dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.87 < 3.85 for lshift + add_n
+C AMD K10 2.75 < 3.85 for lshift + add_n
+C Intel P4 22 > 7.33 for lshift + add_n
+C Intel core2 4.1 > 3.27 for lshift + add_n
+C Intel NHM 4.4 > 3.75 for lshift + add_n
+C Intel SBR 3.17 < 3.46 for lshift + add_n
+C Intel atom ? ? 8.75 for lshift + add_n
+C VIA nano 4.7 < 6.25 for lshift + add_n
+
+C TODO
+C * Can we propagate carry into rdx instead of using a special carry register?
+C That could save enough insns to get to 10 cycles/iteration.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param', `%rcx')
+define(`cnt', `%r8')
+
+define(`vp', `%r12')   C working copy of vp_param, since every mul overwrites %rdx
+define(`n', `%rbp')    C negative limb index; %rcx (n_param) is reused below
+
+ifdef(`OPERATION_addlsh_n',`
+  define(ADDSUB, `add')
+  define(ADCSBB, `adc')
+  define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+  define(ADDSUB, `sub')
+  define(ADCSBB, `sbb')
+  define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(func)
+        FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+        push    %r12
+        push    %rbp
+        push    %rbx
+
+        mov     (vp_param), %rax        C load first V limb early
+
+        mov     $0, R32(n)
+        sub     n_param, n              C n = -n_param
+
+        lea     -16(up,n_param,8), up   C biased end pointers; the loops stop when n reaches zero
+        lea     -16(rp,n_param,8), rp
+        lea     16(vp_param,n_param,8), vp
+
+        mov     n_param, %r9            C r9 = n, reduced mod 4 below to pick the entry point
+
+        mov     %r8, %rcx               C shift count must be in %cl for shl
+        mov     $1, R32(%r8)
+        shl     R8(%rcx), %r8           C r8 = 1 << cnt; limbs are shifted by multiplying with it
+
+        mul     %r8                     C initial multiply
+
+        and     $3, R32(%r9)
+        jz      L(b0)                   C n mod 4 = 0
+        cmp     $2, R32(%r9)
+        jc      L(b1)                   C n mod 4 = 1
+        jz      L(b2)                   C n mod 4 = 2, else fall into b3
+
+L(b3):  mov     %rax, %r11
+        ADDSUB  16(up,n,8), %r11
+        mov     -8(vp,n,8), %rax
+        sbb     R32(%rcx), R32(%rcx)    C save carry/borrow as 0 or -1; mul below changes the flags
+        mov     %rdx, %rbx
+        mul     %r8
+        or      %rax, %rbx              C merge bits shifted out of the previous limb with this limb
+        mov     (vp,n,8), %rax
+        mov     %rdx, %r9
+        mul     %r8
+        or      %rax, %r9
+        add     $3, n
+        jnz     L(lo3)
+        jmp     L(cj3)                  C exactly 3 limbs: go straight to the wind-down
+
+L(b2):  mov     %rax, %rbx
+        mov     -8(vp,n,8), %rax
+        mov     %rdx, %r9
+        mul     %r8
+        or      %rax, %r9
+        add     $2, n
+        jz      L(cj2)                  C exactly 2 limbs
+        mov     %rdx, %r10
+        mov     -16(vp,n,8), %rax
+        mul     %r8
+        or      %rax, %r10
+        xor     R32(%rcx), R32(%rcx)    C clear carry register
+        jmp     L(lo2)
+
+L(b1):  mov     %rax, %r9
+        mov     %rdx, %r10
+        add     $1, n
+        jnz     L(gt1)
+        ADDSUB  8(up,n,8), %r9          C exactly 1 limb; this carry feeds the final ADCSBB directly
+        jmp     L(cj1)
+L(gt1): mov     -16(vp,n,8), %rax
+        mul     %r8
+        or      %rax, %r10
+        mov     %rdx, %r11
+        mov     -8(vp,n,8), %rax
+        mul     %r8
+        or      %rax, %r11
+        ADDSUB  8(up,n,8), %r9
+        ADCSBB  16(up,n,8), %r10
+        ADCSBB  24(up,n,8), %r11
+        mov     (vp,n,8), %rax
+        sbb     R32(%rcx), R32(%rcx)    C save carry/borrow as 0 or -1
+        jmp     L(lo1)
+
+L(b0):  mov     %rax, %r10
+        mov     %rdx, %r11
+        mov     -8(vp,n,8), %rax
+        mul     %r8
+        or      %rax, %r11
+        ADDSUB  16(up,n,8), %r10
+        ADCSBB  24(up,n,8), %r11
+        mov     (vp,n,8), %rax
+        sbb     R32(%rcx), R32(%rcx)    C save carry/borrow as 0 or -1
+        mov     %rdx, %rbx
+        mul     %r8
+        or      %rax, %rbx
+        mov     8(vp,n,8), %rax
+        add     $4, n
+        jz      L(end)                  C exactly 4 limbs
+
+        ALIGN(8)
+L(top): mov     %rdx, %r9
+        mul     %r8
+        or      %rax, %r9
+        mov     %r10, -16(rp,n,8)       C stores lag behind the add/sub chain of the previous group
+L(lo3): mov     %rdx, %r10
+        mov     -16(vp,n,8), %rax
+        mul     %r8
+        or      %rax, %r10
+        mov     %r11, -8(rp,n,8)
+L(lo2): mov     %rdx, %r11
+        mov     -8(vp,n,8), %rax
+        mul     %r8
+        or      %rax, %r11
+        add     R32(%rcx), R32(%rcx)    C restore the saved carry/borrow into CF
+        ADCSBB  (up,n,8), %rbx
+        ADCSBB  8(up,n,8), %r9
+        ADCSBB  16(up,n,8), %r10
+        ADCSBB  24(up,n,8), %r11
+        mov     (vp,n,8), %rax
+        sbb     R32(%rcx), R32(%rcx)    C save carry/borrow again before the next mul
+        mov     %rbx, (rp,n,8)
+L(lo1): mov     %rdx, %rbx
+        mul     %r8
+        or      %rax, %rbx
+        mov     %r9, 8(rp,n,8)
+L(lo0): mov     8(vp,n,8), %rax
+        add     $4, n
+        jnz     L(top)
+
+L(end): mov     %rdx, %r9
+        mul     %r8
+        or      %rax, %r9
+        mov     %r10, -16(rp,n,8)
+L(cj3): mov     %r11, -8(rp,n,8)
+L(cj2): add     R32(%rcx), R32(%rcx)    C restore the saved carry/borrow into CF
+        ADCSBB  (up,n,8), %rbx
+        ADCSBB  8(up,n,8), %r9
+        mov     %rbx, (rp,n,8)
+L(cj1): mov     %r9, 8(rp,n,8)
+        mov     %rdx, %rax              C bits shifted out of the top V limb
+        ADCSBB  $0, %rax                C return value: those bits adjusted by the final carry/borrow
+        pop     %rbx
+        pop     %rbp
+        pop     %r12
+        FUNC_EXIT()
+        ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
new file mode 100644
index 0000000..1172b0d
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/bdiv_q_1.asm
@@ -0,0 +1,179 @@
+dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor,
+dnl returning quotient only.
+
+dnl Copyright 2001, 2002, 2004-2006, 2009, 2011, 2012, 2017 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm/unorm
+C AMD K8,K9 10 +
+C AMD K10 10 +
+C AMD bull 13.7 -
+C AMD pile 13.7 +
+C AMD steam
+C AMD excavator
+C AMD bobcat 15 -
+C AMD jaguar 16 -
+C Intel P4 33 =
+C Intel core2 13.25 =
+C Intel NHM 14 =
+C Intel SBR 8.5 -
+C Intel IBR 8.5 -
+C Intel HWL 8 =
+C Intel BWL 8 =
+C Intel SKL 8 =
+C Intel atom 42 --
+C Intel SLM 20.4 --
+C VIA nano
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n', `%rdx')
+define(`d', `%rcx')
+define(`di', `%r8') C just mpn_pi1_bdiv_q_1
+define(`ncnt', `%r9') C just mpn_pi1_bdiv_q_1
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+        FUNC_ENTRY(4)
+        push    %rbx                    C matches the push in mpn_pi1_bdiv_q_1, whose tail is shared
+
+        mov     %rcx, %rax              C rax = d
+        xor     R32(%rcx), R32(%rcx)    C ncnt count
+        mov     %rdx, %r10              C r10 = n; rdx is used as scratch below
+
+        bt      $0, R32(%rax)
+        jnc     L(evn)                  C skip bsf unless divisor is even
+
+L(odd): mov     %rax, %rbx
+        shr     R32(%rax)
+        and     $127, R32(%rax)         C d/2, 7 bits
+
+        LEA(    binvert_limb_table, %rdx)
+
+        movzbl  (%rdx,%rax), R32(%rax)  C inv 8 bits
+
+        mov     %rbx, %r11              C d without twos
+
+        lea     (%rax,%rax), R32(%rdx)  C 2*inv
+        imul    R32(%rax), R32(%rax)    C inv*inv
+        imul    R32(%rbx), R32(%rax)    C inv*inv*d
+        sub     R32(%rax), R32(%rdx)    C inv = 2*inv - inv*inv*d, 16 bits
+
+        lea     (%rdx,%rdx), R32(%rax)  C 2*inv
+        imul    R32(%rdx), R32(%rdx)    C inv*inv
+        imul    R32(%rbx), R32(%rdx)    C inv*inv*d
+        sub     R32(%rdx), R32(%rax)    C inv = 2*inv - inv*inv*d, 32 bits
+
+        lea     (%rax,%rax), %r8        C 2*inv
+        imul    %rax, %rax              C inv*inv
+        imul    %rbx, %rax              C inv*inv*d
+        sub     %rax, %r8               C inv = 2*inv - inv*inv*d, 64 bits
+
+        jmp     L(pi1)                  C shared tail; r8 = inverse, r10 = n, r11 = d, rcx = ncnt
+
+L(evn): bsf     %rax, %rcx              C rcx = index of the lowest set bit of d
+        shr     R8(%rcx), %rax          C strip the trailing zero bits from d
+        jmp     L(odd)
+EPILOGUE()
+
+PROLOGUE(mpn_pi1_bdiv_q_1)
+        FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+        push    %rbx
+
+        mov     %rcx, %r11              C d
+        mov     %rdx, %r10              C n
+        mov     %r9, %rcx               C ncnt
+
+L(pi1): mov     (up), %rax              C up[0]
+
+        dec     %r10
+        jz      L(one)                  C n = 1: single limb, no carry handling needed
+
+        mov     8(up), %rdx             C up[1]
+        lea     (up,%r10,8), up         C up end
+        lea     (rp,%r10,8), rp         C rp end
+        neg     %r10                    C -n
+
+        shrd    R8(%rcx), %rdx, %rax    C rax = up[0] shifted right by ncnt, top bits from up[1]
+
+        xor     R32(%rbx), R32(%rbx)    C no carry bit yet
+        jmp     L(ent)
+
+        ALIGN(8)
+L(top):
+        C rax q
+        C rbx carry bit, 0 or 1
+        C rcx ncnt
+        C rdx
+        C r10 counter, limbs, negative
+        C r11 d
+
+        mul     %r11                    C carry limb in rdx
+        mov     (up,%r10,8), %rax
+        mov     8(up,%r10,8), %r9
+        shrd    R8(%rcx), %r9, %rax     C next dividend limb, shifted right by ncnt
+        nop                             C NOTE(review): presumably scheduling padding - confirm
+        sub     %rbx, %rax              C apply carry bit
+        setc    R8(%rbx)
+        sub     %rdx, %rax              C apply carry limb
+        adc     $0, R32(%rbx)
+L(ent): imul    %r8, %rax               C quotient limb = rax * inverse, low 64 bits
+        mov     %rax, (rp,%r10,8)
+        inc     %r10
+        jnz     L(top)
+
+        mul     %r11                    C carry limb in rdx
+        mov     (up), %rax              C up high limb
+        shr     R8(%rcx), %rax          C top limb: nothing above it to shift in
+        sub     %rbx, %rax              C apply carry bit
+        sub     %rdx, %rax              C apply carry limb
+        imul    %r8, %rax
+        mov     %rax, (rp)
+        pop     %rbx
+        FUNC_EXIT()
+        ret
+
+L(one): shr     R8(%rcx), %rax
+        imul    %r8, %rax
+        mov     %rax, (rp)
+        pop     %rbx
+        FUNC_EXIT()
+        ret
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
new file mode 100644
index 0000000..86de08c
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/div_qr_1n_pi1.asm
@@ -0,0 +1,249 @@
+dnl x86-64 mpn_div_qr_1n_pi1
+dnl -- Divide an mpn number by a normalized single-limb number,
+dnl using a single-limb inverse.
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C AMD K8,K9 11 +C AMD K10 11 +C AMD bull 16 +C AMD pile 14.25 +C AMD steam ? +C AMD bobcat 16 +C AMD jaguar ? +C Intel P4 47.5 poor +C Intel core 28.5 very poor +C Intel NHM 29 very poor +C Intel SBR 16 poor +C Intel IBR 13.5 +C Intel HWL 12 +C Intel BWL ? 
+C Intel atom 53 very poor
+C VIA nano 19
+
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx') C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8') C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)     C NOTE(review): only STD64 is declared although IFDOS loads follow - confirm
+
+        ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+        FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ')
+IFDOS(` mov 64(%rsp), %r9 ')
+        dec     UN_INPUT
+        jnz     L(first)                C fall through only when the limb count is 1
+
+        C Just a single 2/1 division.
+        C T, U0 are allocated in scratch registers
+        lea     1(U1), T
+        mov     U1, %rax
+        mul     DINV
+        mov     (UP), U0
+        add     U0, %rax
+        adc     T, %rdx                 C rdx = quotient candidate
+        mov     %rdx, T
+        imul    D, %rdx
+        sub     %rdx, U0                C U0 = remainder candidate
+        cmp     U0, %rax
+        lea     (U0, D), %rax           C lea leaves the flags of the cmp intact
+        cmovnc  U0, %rax
+        sbb     $0, T                   C quotient - 1 in the case where D was added back
+        cmp     D, %rax
+        jc      L(single_div_done)      C value in rax already below D
+        sub     D, %rax
+        add     $1, T
+L(single_div_done):
+        mov     T, (QP)                 C store the single quotient limb; rax is left for the caller
+        FUNC_EXIT()
+        ret
+L(first):
+        C FIXME: Could delay some of these until we enter the loop.
+        push    %r15
+        push    %r14
+        push    %r13
+        push    %r12
+        push    %rbx
+        push    %rbp
+
+        mov     D, B2
+        imul    DINV, B2
+        neg     B2                      C B2 = -(D * DINV), low 64 bits
+        mov     B2, B2md
+        sub     D, B2md                 C B2md = B2 - D
+
+        C D not needed until final reduction
+        push    D
+        mov     UN_INPUT, UN            C Clobbers D
+
+        mov     DINV, %rax
+        mul     U1
+        mov     %rax, Q0
+        add     U1, %rdx
+        mov     %rdx, T
+
+        mov     B2, %rax
+        mul     U1
+        mov     -8(UP, UN, 8), U0
+        mov     (UP, UN, 8), U1
+        mov     T, (QP, UN, 8)
+        add     %rax, U0
+        adc     %rdx, U1
+        sbb     U2, U2                  C U2 = 0 or -1, a mask made from the carry out of U1
+        dec     UN                      C dec leaves CF alone, so carry still mirrors U2
+        mov     U1, %rax
+        jz      L(final)
+        mov     $0, R32(Q1)             C mov rather than xor, so CF survives into L(loop)
+
+        ALIGN(16)
+
+        C Loop is 28 instructions, 30 K8/K10 decoder slots, should run
+        C in 10 cycles. At entry, %rax holds an extra copy of U1, Q1
+        C is zero, and carry holds an extra copy of U2.
+L(loop):
+        C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+        C Remains to add in B (U1 + c)
+        cmovc   DINV, Q1
+        mov     U2, Q2
+        neg     Q2
+        mul     DINV
+        add     %rdx, Q1
+        adc     $0, Q2
+        add     Q0, Q1
+        mov     %rax, Q0
+        mov     B2, %rax
+        lea     (B2md, U0), T
+        adc     $0, Q2
+
+        C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+        mul     U1
+        and     B2, U2
+        add     U2, U0
+        cmovnc  U0, T
+
+        C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+        adc     U1, Q1
+        mov     -8(UP, UN, 8), U0
+        adc     Q2, 8(QP, UN, 8)
+        jc      L(q_incr)               C rare: carry ripples into higher quotient limbs
+L(q_incr_done):
+        add     %rax, U0
+        mov     T, %rax
+        adc     %rdx, %rax
+        mov     Q1, (QP, UN, 8)
+        mov     $0, R32(Q1)             C mov rather than xor, the carry is consumed by the sbb
+        sbb     U2, U2
+        dec     UN                      C dec leaves CF alone for the cmovc at L(loop)
+        mov     %rax, U1
+        jnz     L(loop)
+
+L(final):
+        pop     D                       C restore the divisor pushed at L(first)
+
+        mov     U2, Q1
+        and     D, U2
+        sub     U2, %rax
+        neg     Q1
+
+        mov     %rax, U1
+        sub     D, %rax
+        cmovc   U1, %rax
+        sbb     $-1, Q1
+
+        lea     1(%rax), T
+        mul     DINV
+        add     U0, %rax
+        adc     T, %rdx
+        mov     %rdx, T
+        imul    D, %rdx
+        sub     %rdx, U0
+        cmp     U0, %rax
+        lea     (U0, D), %rax           C lea leaves the flags of the cmp intact
+        cmovnc  U0, %rax
+        sbb     $0, T
+        cmp     D, %rax
+        jc      L(div_done)
+        sub     D, %rax
+        add     $1, T
+L(div_done):
+        add     T, Q0
+        mov     Q0, (QP)
+        adc     Q1, 8(QP)
+        jnc     L(done)
+L(final_q_incr):
+        addq    $1, 16(QP)
+        lea     8(QP), QP               C lea leaves CF of the addq intact for the jc
+        jc      L(final_q_incr)
+
+L(done):
+        pop     %rbp                    C pops mirror the pushes at L(first)
+        pop     %rbx
+        pop     %r12
+        pop     %r13
+        pop     %r14
+        pop     %r15
+        FUNC_EXIT()
+        ret
+
+L(q_incr):
+        C U1 is not live, so use it for indexing
+        lea     16(QP, UN, 8), U1
+L(q_incr_loop):
+        addq    $1, (U1)
+        jnc     L(q_incr_done)          C stop as soon as the increment no longer carries
+        lea     8(U1), U1
+        jmp     L(q_incr_loop)
+EPILOGUE()
diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h b/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
new file mode 100644
index 0000000..d87cc3b
--- /dev/null
+++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/gmp-mparam.h
@@ -0,0 +1,237 @@
+/* AMD K8 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 64 +#define GMP_LIMB_BYTES 8 + +#if 0 +#undef mpn_sublsh_n +#define mpn_sublsh_n(rp,up,vp,n,c) \ + (((rp) == (up)) ? 
mpn_submul_1 (rp, vp, n, CNST_LIMB(1) << (c)) \ + : MPN(mpn_sublsh_n)(rp,up,vp,n,c)) +#endif + +/* 2500 MHz K8 Brisbane */ +/* FFT tuning limit = 115,768,433 */ +/* Generated by tuneup.c, 2019-10-18, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 14 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 35 +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 9 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1_NORM_THRESHOLD 1 +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 16 + +#define DIV_1_VS_MUL_1_PERCENT 309 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 81 +#define MUL_TOOM44_THRESHOLD 232 +#define MUL_TOOM6H_THRESHOLD 324 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 97 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 153 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 34 +#define SQR_TOOM3_THRESHOLD 114 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 430 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 36 + +#define MULMOD_BNM1_THRESHOLD 17 +#define SQRMOD_BNM1_THRESHOLD 19 + +#define MUL_FFT_MODF_THRESHOLD 654 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 654, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 27, 7}, { 14, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 29, 8}, { 15, 7}, { 32, 8}, \ + { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 44, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 31, 7}, { 63, 8}, \ + { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ 
+ { 53, 9}, { 27, 8}, { 57, 9}, { 31, 8}, \ + { 67, 9}, { 35, 8}, { 71, 9}, { 39, 8}, \ + { 81, 9}, { 43,10}, { 23, 9}, { 55, 8}, \ + { 111,10}, { 31, 9}, { 71,10}, { 39, 9}, \ + { 87,10}, { 47, 9}, { 99,10}, { 55, 9}, \ + { 111,11}, { 31,10}, { 63, 9}, { 131,10}, \ + { 71, 9}, { 147,10}, { 87,11}, { 47,10}, \ + { 111,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 199,11}, { 111,12}, \ + { 63,11}, { 143,10}, { 287,11}, { 159,12}, \ + { 95,11}, { 191,10}, { 383,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,11}, { 255,10}, \ + { 511,11}, { 271,10}, { 543,11}, { 287,12}, \ + { 159,11}, { 319,10}, { 639,11}, { 335,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 415,12}, \ + { 223,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 671,12}, { 351,11}, \ + { 703,13}, { 191,12}, { 383,11}, { 767,12}, \ + { 415,11}, { 831,12}, { 447,11}, { 895,12}, \ + { 479,14}, { 127,13}, { 255,12}, { 543,11}, \ + { 1087,12}, { 575,11}, { 1151,12}, { 607,13}, \ + { 319,12}, { 735,13}, { 383,12}, { 831,13}, \ + { 447,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1087,13}, { 575,12}, { 1215,13}, { 639,12}, \ + { 1279,13}, { 703,12}, { 1407,14}, { 383,13}, \ + { 767,12}, { 1535,13}, { 831,12}, { 1663,13}, \ + { 959,15}, { 255,14}, { 511,13}, { 1215,14}, \ + { 639,13}, { 1471,14}, { 767,13}, { 1663,14}, \ + { 895,13}, { 1855,15}, { 511,14}, { 1023,13}, \ + { 2047,14}, { 1151,13}, { 2367,14}, { 1407,15}, \ + { 767,14}, { 1791,16}, { 511,15}, { 1023,14}, \ + { 2303,15}, { 1279,14}, { 2687,15}, { 1535,14}, \ + { 3199,15}, { 1791,16}, { 1023,15}, { 2047,14}, \ + { 4223,15}, { 2303,14}, { 4735,15}, { 2559,16}, \ + { 1535,15}, { 3071,14}, { 6271,15}, { 3327,17}, \ + { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ + {2097152,22}, {4194304,23}, {8388608,24} } +#define MUL_FFT_TABLE3_SIZE 183 +#define MUL_FFT_THRESHOLD 11520 + +#define SQR_FFT_MODF_THRESHOLD 540 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 540, 5}, { 
21, 6}, { 11, 5}, { 23, 6}, \ + { 12, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 31, 7}, { 16, 6}, { 33, 7}, { 33, 8}, \ + { 17, 7}, { 37, 8}, { 19, 7}, { 39, 8}, \ + { 21, 7}, { 43, 8}, { 23, 7}, { 47, 8}, \ + { 25, 7}, { 51, 8}, { 29, 9}, { 15, 8}, \ + { 37, 9}, { 19, 8}, { 43, 9}, { 23, 8}, \ + { 51, 9}, { 27, 8}, { 55, 9}, { 31, 8}, \ + { 65, 9}, { 35, 8}, { 71, 9}, { 43,10}, \ + { 23, 9}, { 55,10}, { 31, 9}, { 71,10}, \ + { 39, 9}, { 83,10}, { 47, 9}, { 99,10}, \ + { 55, 9}, { 111,11}, { 31,10}, { 63, 9}, \ + { 127,10}, { 87,11}, { 47,10}, { 111,12}, \ + { 31,11}, { 63,10}, { 143,11}, { 79,10}, \ + { 167,11}, { 95,10}, { 191,11}, { 111,12}, \ + { 63,11}, { 127, 9}, { 511,11}, { 143,10}, \ + { 287, 9}, { 575,11}, { 159,12}, { 95,11}, \ + { 191,10}, { 383, 9}, { 767,11}, { 207,10}, \ + { 415,13}, { 63,12}, { 127,10}, { 511, 9}, \ + { 1023,11}, { 271,10}, { 543, 9}, { 1087,11}, \ + { 287,10}, { 575,12}, { 159,11}, { 319,10}, \ + { 639,11}, { 335,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 767,11}, \ + { 415,10}, { 831,12}, { 223,11}, { 447,13}, \ + { 127,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,12}, { 287,11}, { 575,10}, { 1151,11}, \ + { 607,12}, { 319,11}, { 639,10}, { 1279,11}, \ + { 671,12}, { 351,11}, { 703,13}, { 191,12}, \ + { 383,11}, { 767,12}, { 415,11}, { 831,12}, \ + { 447,11}, { 895,14}, { 127,12}, { 511,11}, \ + { 1023,12}, { 543,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 607,11}, { 1215,13}, { 319,12}, \ + { 639,11}, { 1279,12}, { 671,11}, { 1343,12}, \ + { 703,11}, { 1407,12}, { 735,13}, { 383,12}, \ + { 767,11}, { 1535,12}, { 831,13}, { 447,12}, \ + { 959,13}, { 511,12}, { 1087,13}, { 575,12}, \ + { 1215,13}, { 639,12}, { 1343,13}, { 703,12}, \ + { 1407,14}, { 383,13}, { 767,12}, { 1535,13}, \ + { 831,12}, { 1663,13}, { 895,12}, { 1791,13}, \ + { 959,14}, { 511,13}, { 1215,14}, { 639,13}, \ + { 1471,14}, { 767,13}, { 1663,14}, { 895,13}, \ + { 1791,15}, { 511,14}, { 
1023,13}, { 2111,14}, \ + { 1151,13}, { 2303,14}, { 1407,15}, { 767,14}, \ + { 1791,16}, { 511,15}, { 1023,14}, { 2303,15}, \ + { 1279,14}, { 2687,15}, { 1535,14}, { 3199,15}, \ + { 1791,16}, { 1023,15}, { 2047,14}, { 4223,15}, \ + { 2303,14}, { 4863,15}, { 2559,16}, { 1535,15}, \ + { 3071,14}, { 6271,15}, { 3327,17}, { 131072,18}, \ + { 262144,19}, { 524288,20}, {1048576,21}, {2097152,22}, \ + {4194304,23}, {8388608,24} } +#define SQR_FFT_TABLE3_SIZE 202 +#define SQR_FFT_THRESHOLD 7296 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 61 +#define MULLO_MUL_N_THRESHOLD 22239 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 0 /* never mpn_sqrlo_basecase */ +#define SQRLO_SQR_THRESHOLD 14281 + +#define DC_DIV_QR_THRESHOLD 47 +#define DC_DIVAPPR_Q_THRESHOLD 266 +#define DC_BDIV_QR_THRESHOLD 38 +#define DC_BDIV_Q_THRESHOLD 104 + +#define INV_MULMOD_BNM1_THRESHOLD 54 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 250 + +#define BINV_NEWTON_THRESHOLD 258 +#define REDC_1_TO_REDC_2_THRESHOLD 35 +#define REDC_2_TO_REDC_N_THRESHOLD 79 + +#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 1895 +#define MUPI_DIV_QR_THRESHOLD 99 +#define MU_BDIV_QR_THRESHOLD 1787 +#define MU_BDIV_Q_THRESHOLD 1895 + +#define POWM_SEC_TABLE 1,16,194,960,2825 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 26 +#define SET_STR_DC_THRESHOLD 248 +#define SET_STR_PRECOMPUTE_THRESHOLD 1747 + +#define FAC_DSC_THRESHOLD 1240 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 21 +#define HGCD2_DIV1_METHOD 3 /* 4.10% faster than 5 */ +#define HGCD_THRESHOLD 141 +#define HGCD_APPR_THRESHOLD 181 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 622 +#define GCDEXT_DC_THRESHOLD 496 +#define JACOBI_BASE_METHOD 1 /* 0.97% faster than 3 */ + +/* Tuneup completed successfully, took 131832 seconds */ diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm 
b/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm new file mode 100644 index 0000000..ca2efb9 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mul_basecase.asm @@ -0,0 +1,469 @@ +dnl AMD64 mpn_mul_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey. + +dnl Copyright 2008, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 2.375 +C AMD K10 2.375 +C Intel P4 15-16 +C Intel core2 4.45 +C Intel corei 4.35 +C Intel atom ? +C VIA nano 4.5 + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C TODO +C * Use fewer registers. (how??? I can't see it -- david) +C * Avoid some "mov $0,r" and instead use "xor r,r". +C * Can the top of each L(addmul_outer_n) prologue be folded into the +C mul_1/mul_2 prologues, saving a LEA (%rip)? 
It would slow down the +C case where vn = 1 or 2; is it worth it? + +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n', `%r11') +define(`outer_addr', `%r14') +define(`un', `%r13') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + xor R32(un), R32(un) + mov (up), %rax + mov (vp), v0 + + sub un_param, un C rdx used by mul + mov un, n + mov R32(un_param), R32(w0) + + lea (rp,un_param,8), rp + lea (up,un_param,8), up + + mul v0 + + test $1, R8(vn) + jz L(mul_2) + +C =========================================================== +C mul_1 for vp[0] if vn is odd + +L(mul_1): + and $3, R32(w0) + jz L(mul_1_prologue_0) + cmp $2, R32(w0) + jc L(mul_1_prologue_1) + jz L(mul_1_prologue_2) + +L(mul_1_prologue_3): + add $-1, n + lea L(addmul_outer_3)(%rip), outer_addr + mov %rax, w3 + mov %rdx, w0 + jmp L(mul_1_entry_3) + +L(mul_1_prologue_0): + mov %rax, w2 + mov %rdx, w3 C note: already w0 == 0 + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_1_entry_0) + +L(mul_1_prologue_1): + cmp $-1, un + jne 2f + mov %rax, -8(rp) + mov %rdx, (rp) + jmp L(ret) +2: add $1, n + lea L(addmul_outer_1)(%rip), outer_addr + mov %rax, w1 + mov %rdx, w2 + xor R32(w3), R32(w3) + mov (up,n,8), %rax + jmp L(mul_1_entry_1) + +L(mul_1_prologue_2): + add $-2, n + lea L(addmul_outer_2)(%rip), outer_addr + mov %rax, w0 + mov %rdx, w1 + mov 24(up,n,8), %rax + xor R32(w2), R32(w2) + xor R32(w3), R32(w3) + jmp L(mul_1_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,n,8) + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 
+L(mul_1_entry_1): + xor R32(w0), R32(w0) + mul v0 + mov w1, -8(rp,n,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,n,8), %rax + mul v0 + mov w2, (rp,n,8) + add %rax, w3 + adc %rdx, w0 +L(mul_1_entry_3): + mov 16(up,n,8), %rax + mul v0 + mov w3, 8(rp,n,8) + xor R32(w2), R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,n,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, n + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + adc %rdx, w2 + mov w2, (rp) + + add $-1, vn C vn -= 1 + jz L(ret) + + mov 8(vp), v0 + mov 16(vp), v1 + + lea 8(vp), vp C vp += 1 + lea 8(rp), rp C rp += 1 + + jmp *outer_addr + +C =========================================================== +C mul_2 for vp[0], vp[1] if vn is even + + ALIGN(16) +L(mul_2): + mov 8(vp), v1 + + and $3, R32(w0) + jz L(mul_2_prologue_0) + cmp $2, R32(w0) + jz L(mul_2_prologue_2) + jc L(mul_2_prologue_1) + +L(mul_2_prologue_3): + lea L(addmul_outer_3)(%rip), outer_addr + add $2, n + mov %rax, -16(rp,n,8) + mov %rdx, w2 + xor R32(w3), R32(w3) + xor R32(w0), R32(w0) + mov -16(up,n,8), %rax + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_prologue_0): + add $3, n + mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov -24(up,n,8), %rax + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_2_entry_0) + + ALIGN(16) +L(mul_2_prologue_1): + mov %rax, w3 + mov %rdx, w0 + xor R32(w1), R32(w1) + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_2): + add $1, n + lea L(addmul_outer_2)(%rip), outer_addr + mov $0, R32(w0) + mov $0, R32(w1) + mov %rax, w2 + mov -8(up,n,8), %rax + mov %rdx, w3 + jmp L(mul_2_entry_2) + + C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_2_top): + mov -32(up,n,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,n,8), %rax + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) 
+L(mul_2_entry_0): + mul v1 + add %rax, w1 + mov w0, -24(rp,n,8) + adc %rdx, w2 + mov -16(up,n,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,n,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(rp,n,8) +L(mul_2_entry_3): + mul v1 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + adc R32(w1), R32(w0) C adc $0, w0 +L(mul_2_entry_2): + mul v1 + add %rax, w3 + mov w2, -8(rp,n,8) + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) +L(mul_2_entry_1): + add $4, n + mov w3, -32(rp,n,8) + js L(mul_2_top) + + mov -32(up,n,8), %rax C FIXME: n is constant + mul v1 + add %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + mov 16(vp), v0 + mov 24(vp), v1 + + lea 16(vp), vp C vp += 2 + lea 16(rp), rp C rp += 2 + + jmp *outer_addr + + +C =========================================================== +C addmul_2 for remaining vp's + + C in the following prologues, we reuse un to store the + C adjusted value of n that is reloaded on each iteration + +L(addmul_outer_0): + add $3, un + lea 0(%rip), outer_addr + + mov un, n + mov -24(up,un,8), %rax + mul v0 + mov %rax, w0 + mov -24(up,un,8), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + jmp L(addmul_entry_0) + +L(addmul_outer_1): + mov un, n + mov (up,un,8), %rax + mul v0 + mov %rax, w3 + mov (up,un,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + jmp L(addmul_entry_1) + +L(addmul_outer_2): + add $1, un + lea 0(%rip), outer_addr + + mov un, n + mov -8(up,un,8), %rax + mul v0 + xor R32(w0), R32(w0) + mov %rax, w2 + xor R32(w1), R32(w1) + mov %rdx, w3 + mov -8(up,un,8), %rax + jmp L(addmul_entry_2) + +L(addmul_outer_3): + add $2, un + lea 0(%rip), outer_addr + + mov un, n + mov -16(up,un,8), %rax + xor R32(w3), R32(w3) + mul v0 + mov %rax, w1 + mov -16(up,un,8), %rax + mov %rdx, w2 + jmp L(addmul_entry_3) + + C this loop is 19 c/loop 
= 2.375 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(addmul_top): + add w3, -32(rp,n,8) + adc %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc R32(w2), R32(w2) C adc $0, w2 +L(addmul_entry_0): + mul v1 + xor R32(w3), R32(w3) + add w0, -24(rp,n,8) + adc %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + mul v0 + add %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(addmul_entry_3): + mul v1 + add w1, -16(rp,n,8) + adc %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mul v0 + xor R32(w0), R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov -8(up,n,8), %rax + adc R32(w1), R32(w0) C adc $0, w0 +L(addmul_entry_2): + mul v1 + add w2, -8(rp,n,8) + adc %rax, w3 + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + mov (up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(addmul_entry_1): + mul v1 + add $4, n + js L(addmul_top) + + add w3, -8(rp) + adc %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + lea 16(rp), rp C rp += 2 + lea 16(vp), vp C vp += 2 + + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + + ALIGN(16) +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret + +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm new file mode 100644 index 0000000..fa00f42 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mullo_basecase.asm @@ -0,0 +1,436 @@ +dnl AMD64 mpn_mullo_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C * There is a major stupidity in that we call mpn_mul_1 initially, for a +C large trip count. Instead, we should start with mul_2 for any operand +C size congruence class. +C * Stop iterating addmul_2 earlier, falling into straight-line triangle code +C for the last 2-3 iterations. +C * Perhaps implement n=4 special code. +C * The reload of the outer loop jump address hurts branch prediction. +C * The addmul_2 loop ends with an MUL whose high part is not used upon loop +C exit. 
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`vp_param', `%rdx') +define(`n', `%rcx') + +define(`vp', `%r11') +define(`outer_addr', `%r8') +define(`j', `%r9') +define(`v0', `%r13') +define(`v1', `%r14') +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mullo_basecase) + FUNC_ENTRY(4) + cmp $4, n + jge L(gen) + mov (up), %rax C u0 + mov (vp_param), %r8 C v0 + + lea L(tab)(%rip), %r9 +ifdef(`PIC', +` movslq (%r9,%rcx,4), %r10 + add %r10, %r9 + jmp *%r9 +',` + jmp *(%r9,n,8) +') + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(tab), L(tab)) C not allowed + JMPENT( L(1), L(tab)) C 1 + JMPENT( L(2), L(tab)) C 2 + JMPENT( L(3), L(tab)) C 3 +dnl JMPENT( L(0m4), L(tab)) C 4 +dnl JMPENT( L(1m4), L(tab)) C 5 +dnl JMPENT( L(2m4), L(tab)) C 6 +dnl JMPENT( L(3m4), L(tab)) C 7 +dnl JMPENT( L(0m4), L(tab)) C 8 +dnl JMPENT( L(1m4), L(tab)) C 9 +dnl JMPENT( L(2m4), L(tab)) C 10 +dnl JMPENT( L(3m4), L(tab)) C 11 + TEXT + +L(1): imul %r8, %rax + mov %rax, (rp) + FUNC_EXIT() + ret + +L(2): mov 8(vp_param), %r11 + imul %rax, %r11 C u0 x v1 + mul %r8 C u0 x v0 + mov %rax, (rp) + imul 8(up), %r8 C u1 x v0 + lea (%r11, %rdx), %rax + add %r8, %rax + mov %rax, 8(rp) + FUNC_EXIT() + ret + +L(3): mov 8(vp_param), %r9 C v1 + mov 16(vp_param), %r11 + mul %r8 C u0 x v0 -> + mov %rax, (rp) C r0 + mov (up), %rax C u0 + mov %rdx, %rcx C r1 + mul %r9 C u0 x v1 -> + imul 8(up), %r9 C u1 x v1 -> r2 + mov 16(up), %r10 + imul %r8, %r10 C u2 x v0 -> r2 + add %rax, %rcx + adc %rdx, %r9 + add %r10, %r9 + mov 8(up), %rax C u1 + mul %r8 C u1 x v0 -> + add %rax, %rcx + adc %rdx, %r9 + mov %r11, %rax + imul (up), %rax C u0 x v2 -> r2 + add %rax, %r9 + mov %rcx, 8(rp) + mov %r9, 16(rp) + FUNC_EXIT() + ret + +L(0m4): +L(1m4): +L(2m4): +L(3m4): +L(gen): push %rbx + push %rbp + push %r13 + push %r14 + push %r15 + + mov (up), %rax + mov (vp_param), v0 + mov 
vp_param, vp + + lea (rp,n,8), rp + lea (up,n,8), up + neg n + + mul v0 + + test $1, R8(n) + jz L(mul_2) + +L(mul_1): + lea -8(rp), rp + lea -8(up), up + test $2, R8(n) + jnz L(mul_1_prologue_3) + +L(mul_1_prologue_2): C n = 7, 11, 15, ... + lea -1(n), j + lea L(addmul_outer_1)(%rip), outer_addr + mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + xor R32(w3), R32(w3) + mov 16(up,n,8), %rax + jmp L(mul_1_entry_2) + +L(mul_1_prologue_3): C n = 5, 9, 13, ... + lea 1(n), j + lea L(addmul_outer_3)(%rip), outer_addr + mov %rax, w2 + mov %rdx, w3 + xor R32(w0), R32(w0) + jmp L(mul_1_entry_0) + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,j,8) + add %rax, w1 + mov (up,j,8), %rax + adc %rdx, w2 + xor R32(w0), R32(w0) + mul v0 + mov w1, -8(rp,j,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,j,8), %rax + mul v0 + mov w2, (rp,j,8) + add %rax, w3 + adc %rdx, w0 + mov 16(up,j,8), %rax + mul v0 + mov w3, 8(rp,j,8) + xor R32(w2), R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,j,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, j + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + adc %rdx, w2 + + imul (up), v0 + add v0, w2 + mov w2, (rp) + + add $1, n + jz L(ret) + + mov 8(vp), v0 + mov 16(vp), v1 + + lea 16(up), up + lea 8(vp), vp + lea 24(rp), rp + + jmp *outer_addr + + +L(mul_2): + mov 8(vp), v1 + test $2, R8(n) + jz L(mul_2_prologue_3) + + ALIGN(16) +L(mul_2_prologue_1): + lea 0(n), j + mov %rax, w3 + mov %rdx, w0 + xor R32(w1), R32(w1) + mov (up,n,8), %rax + lea L(addmul_outer_3)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_3): + lea 2(n), j + mov $0, R32(w3) + mov %rax, w1 + mov (up,n,8), %rax + mov %rdx, w2 + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_top): + mov -32(up,j,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,j,8), %rax + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc 
%rdx, w1 + adc $0, R32(w2) + mul v1 + add %rax, w1 + mov w0, -24(rp,j,8) + adc %rdx, w2 + mov -16(up,j,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,j,8), %rax + adc $0, R32(w3) +L(mul_2_entry_3): + mov $0, R32(w0) + mov w1, -16(rp,j,8) + mul v1 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + adc R32(w1), R32(w0) + mul v1 + add %rax, w3 + mov w2, -8(rp,j,8) + adc %rdx, w0 + mov (up,j,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) +L(mul_2_entry_1): + add $4, j + mov w3, -32(rp,j,8) + js L(mul_2_top) + + imul -16(up), v1 + add v1, w0 + imul -8(up), v0 + add v0, w0 + mov w0, -8(rp) + + add $2, n + jz L(ret) + + mov 16(vp), v0 + mov 24(vp), v1 + + lea 16(vp), vp + lea 16(rp), rp + + jmp *outer_addr + + +L(addmul_outer_1): + lea -2(n), j + mov -16(up,n,8), %rax + mul v0 + mov %rax, w3 + mov -16(up,n,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + lea L(addmul_outer_3)(%rip), outer_addr + jmp L(addmul_entry_1) + +L(addmul_outer_3): + lea 0(n), j + mov -16(up,n,8), %rax + xor R32(w3), R32(w3) + mul v0 + mov %rax, w1 + mov -16(up,n,8), %rax + mov %rdx, w2 + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(addmul_entry_3) + + ALIGN(16) +L(addmul_top): + add w3, -32(rp,j,8) + adc %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,j,8), %rax + adc %rdx, w1 + adc R32(w2), R32(w2) + mul v1 + xor R32(w3), R32(w3) + add w0, -24(rp,j,8) + adc %rax, w1 + mov -16(up,j,8), %rax + adc %rdx, w2 + mul v0 + add %rax, w1 + mov -16(up,j,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(addmul_entry_3): + mul v1 + add w1, -16(rp,j,8) + adc %rax, w2 + mov -8(up,j,8), %rax + adc %rdx, w3 + mul v0 + xor R32(w0), R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov -8(up,j,8), %rax + adc R32(w1), R32(w0) + mul v1 + add w2, -8(rp,j,8) + adc %rax, w3 + adc %rdx, w0 + mov (up,j,8), %rax + mul v0 + 
add %rax, w3 + mov (up,j,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(addmul_entry_1): + mul v1 + add $4, j + js L(addmul_top) + + add w3, -32(rp) + adc %rax, w0 + + imul -24(up), v0 + add v0, w0 + add w0, -24(rp) + + add $2, n + jns L(ret) + + lea 16(vp), vp + + mov (vp), v0 + mov 8(vp), v1 + + lea -16(up), up + + jmp *outer_addr + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm new file mode 100644 index 0000000..86f1414 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/mulmid_basecase.asm @@ -0,0 +1,559 @@ +dnl AMD64 mpn_mulmid_basecase + +dnl Contributed by David Harvey. + +dnl Copyright 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb +C K8,K9: 2.375 (2.5 when un - vn is "small") +C K10: ? +C P4: ? +C P6-15: ? 
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp_param',`%rcx') +define(`vn', `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%rcx') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n', `%r11') +define(`outer_addr', `%r14') +define(`un', `%r13') +define(`vp', `%r15') + +define(`vp_inner', `%r10') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mulmid_basecase) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8d ') + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov vp_param, vp + + C use un for row length (= un_param - vn + 1) + lea 1(un_param), un + sub vn, un + + lea (rp,un,8), rp + + cmp $4, un C TODO: needs tuning + jc L(diagonal) + + lea (up,un_param,8), up + + test $1, vn + jz L(mul_2) + +C =========================================================== +C mul_1 for vp[0] if vn is odd + +L(mul_1): + mov R32(un), R32(w0) + + neg un + mov (up,un,8), %rax + mov (vp), v0 + mul v0 + + and $-4, un C round down to multiple of 4 + mov un, n + + and $3, R32(w0) + jz L(mul_1_prologue_0) + cmp $2, R32(w0) + jc L(mul_1_prologue_1) + jz L(mul_1_prologue_2) + +L(mul_1_prologue_3): + mov %rax, w3 + mov %rdx, w0 + lea L(addmul_prologue_3)(%rip), outer_addr + jmp L(mul_1_entry_3) + + ALIGN(16) +L(mul_1_prologue_0): + mov %rax, w2 + mov %rdx, w3 C note already w0 == 0 + lea L(addmul_prologue_0)(%rip), outer_addr + jmp L(mul_1_entry_0) + + ALIGN(16) +L(mul_1_prologue_1): + add $4, n + mov %rax, w1 + mov %rdx, w2 + mov $0, R32(w3) + mov (up,n,8), %rax + lea L(addmul_prologue_1)(%rip), outer_addr + jmp L(mul_1_entry_1) + + ALIGN(16) +L(mul_1_prologue_2): + mov %rax, w0 + mov %rdx, w1 + mov 24(up,n,8), %rax + mov $0, R32(w2) + mov $0, R32(w3) + lea L(addmul_prologue_2)(%rip), outer_addr + jmp L(mul_1_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8 + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,n,8) + add %rax, w1 
+ mov (up,n,8), %rax + adc %rdx, w2 +L(mul_1_entry_1): + mov $0, R32(w0) + mul v0 + mov w1, -8(rp,n,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,n,8), %rax + mul v0 + mov w2, (rp,n,8) + add %rax, w3 + adc %rdx, w0 +L(mul_1_entry_3): + mov 16(up,n,8), %rax + mul v0 + mov w3, 8(rp,n,8) + mov $0, R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,n,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, n + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + mov w2, 8(rp) C zero last limb of output + adc %rdx, w2 + mov w2, (rp) + + dec vn + jz L(ret) + + lea -8(up), up + lea 8(vp), vp + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C mul_2 for vp[0], vp[1] if vn is even + + ALIGN(16) +L(mul_2): + mov R32(un), R32(w0) + + neg un + mov -8(up,un,8), %rax + mov (vp), v0 + mov 8(vp), v1 + mul v1 + + and $-4, un C round down to multiple of 4 + mov un, n + + and $3, R32(w0) + jz L(mul_2_prologue_0) + cmp $2, R32(w0) + jc L(mul_2_prologue_1) + jz L(mul_2_prologue_2) + +L(mul_2_prologue_3): + mov %rax, w1 + mov %rdx, w2 + lea L(addmul_prologue_3)(%rip), outer_addr + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_prologue_0): + mov %rax, w0 + mov %rdx, w1 + lea L(addmul_prologue_0)(%rip), outer_addr + jmp L(mul_2_entry_0) + + ALIGN(16) +L(mul_2_prologue_1): + mov %rax, w3 + mov %rdx, w0 + mov $0, R32(w1) + lea L(addmul_prologue_1)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_2): + mov %rax, w2 + mov %rdx, w3 + mov $0, R32(w0) + mov 16(up,n,8), %rax + lea L(addmul_prologue_2)(%rip), outer_addr + jmp L(mul_2_entry_2) + + + C this loop is 18 c/loop = 2.25 c/l on K8 + + ALIGN(16) +L(mul_2_top): + mov -8(up,n,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 +L(mul_2_entry_0): + mov $0, R32(w2) + mov (up,n,8), %rax + mul v0 + add %rax, w0 + mov (up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add 
%rax, w1 + mov w0, (rp,n,8) + adc %rdx, w2 +L(mul_2_entry_3): + mov 8(up,n,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov $0, R32(w0) + adc $0, R32(w3) + mov 8(up,n,8), %rax + mov w1, 8(rp,n,8) + mul v1 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 +L(mul_2_entry_2): + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov 16(up,n,8), %rax + adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add %rax, w3 + mov w2, 16(rp,n,8) + adc %rdx, w0 +L(mul_2_entry_1): + mov 24(up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) + add $4, n + mov w3, -8(rp,n,8) + jnz L(mul_2_top) + + mov w0, (rp) + mov w1, 8(rp) + + sub $2, vn + jz L(ret) + + lea 16(vp), vp + lea -16(up), up + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C addmul_2 for remaining vp's + + ALIGN(16) +L(addmul_prologue_0): + mov -8(up,n,8), %rax + mul v1 + mov %rax, w1 + mov %rdx, w2 + mov $0, R32(w3) + jmp L(addmul_entry_0) + + ALIGN(16) +L(addmul_prologue_1): + mov 16(up,n,8), %rax + mul v1 + mov %rax, w0 + mov %rdx, w1 + mov $0, R32(w2) + mov 24(up,n,8), %rax + jmp L(addmul_entry_1) + + ALIGN(16) +L(addmul_prologue_2): + mov 8(up,n,8), %rax + mul v1 + mov %rax, w3 + mov %rdx, w0 + mov $0, R32(w1) + jmp L(addmul_entry_2) + + ALIGN(16) +L(addmul_prologue_3): + mov (up,n,8), %rax + mul v1 + mov %rax, w2 + mov %rdx, w3 + mov $0, R32(w0) + mov $0, R32(w1) + jmp L(addmul_entry_3) + + C this loop is 19 c/loop = 2.375 c/l on K8 + + ALIGN(16) +L(addmul_top): + mov $0, R32(w3) + add %rax, w0 + mov -8(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) + mul v1 + add w0, -8(rp,n,8) + adc %rax, w1 + adc %rdx, w2 +L(addmul_entry_0): + mov (up,n,8), %rax + mul v0 + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) + mul v1 + add w1, (rp,n,8) + mov $0, R32(w1) + adc %rax, w2 + mov $0, R32(w0) + adc %rdx, w3 +L(addmul_entry_3): + mov 8(up,n,8), %rax + mul v0 + add %rax, w2 + mov 8(up,n,8), %rax + 
adc %rdx, w3 + adc $0, R32(w0) + mul v1 + add w2, 8(rp,n,8) + adc %rax, w3 + adc %rdx, w0 +L(addmul_entry_2): + mov 16(up,n,8), %rax + mul v0 + add %rax, w3 + mov 16(up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) + mul v1 + add w3, 16(rp,n,8) + nop C don't ask... + adc %rax, w0 + mov $0, R32(w2) + mov 24(up,n,8), %rax + adc %rdx, w1 +L(addmul_entry_1): + mul v0 + add $4, n + jnz L(addmul_top) + + add %rax, w0 + adc %rdx, w1 + adc $0, R32(w2) + + add w0, -8(rp) + adc w1, (rp) + adc w2, 8(rp) + + sub $2, vn + jz L(ret) + + lea 16(vp), vp + lea -16(up), up + + mov un, n + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + +C =========================================================== +C accumulate along diagonals if un - vn is small + + ALIGN(16) +L(diagonal): + xor R32(w0), R32(w0) + xor R32(w1), R32(w1) + xor R32(w2), R32(w2) + + neg un + + mov R32(vn), %eax + and $3, %eax + jz L(diag_prologue_0) + cmp $2, %eax + jc L(diag_prologue_1) + jz L(diag_prologue_2) + +L(diag_prologue_3): + lea -8(vp), vp + mov vp, vp_inner + add $1, vn + mov vn, n + lea L(diag_entry_3)(%rip), outer_addr + jmp L(diag_entry_3) + +L(diag_prologue_0): + mov vp, vp_inner + mov vn, n + lea 0(%rip), outer_addr + mov -8(up,n,8), %rax + jmp L(diag_entry_0) + +L(diag_prologue_1): + lea 8(vp), vp + mov vp, vp_inner + add $3, vn + mov vn, n + lea 0(%rip), outer_addr + mov -8(vp_inner), %rax + jmp L(diag_entry_1) + +L(diag_prologue_2): + lea -16(vp), vp + mov vp, vp_inner + add $2, vn + mov vn, n + lea 0(%rip), outer_addr + mov 16(vp_inner), %rax + jmp L(diag_entry_2) + + + C this loop is 10 c/loop = 2.5 c/l on K8 + + ALIGN(16) +L(diag_top): + add %rax, w0 + adc %rdx, w1 + mov -8(up,n,8), %rax + adc $0, w2 +L(diag_entry_0): + mulq (vp_inner) + add %rax, w0 + adc %rdx, w1 + adc $0, w2 +L(diag_entry_3): + mov -16(up,n,8), %rax + mulq 8(vp_inner) + add %rax, w0 + mov 16(vp_inner), %rax + adc %rdx, w1 + adc $0, w2 +L(diag_entry_2): + mulq -24(up,n,8) + add %rax, w0 + mov 24(vp_inner), %rax + adc %rdx, w1 + 
lea 32(vp_inner), vp_inner + adc $0, w2 +L(diag_entry_1): + mulq -32(up,n,8) + sub $4, n + jnz L(diag_top) + + add %rax, w0 + adc %rdx, w1 + adc $0, w2 + + mov w0, (rp,un,8) + + inc un + jz L(diag_end) + + mov vn, n + mov vp, vp_inner + + lea 8(up), up + mov w1, w0 + mov w2, w1 + xor R32(w2), R32(w2) + + jmp *outer_addr + +L(diag_end): + mov w1, (rp) + mov w2, 8(rp) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm new file mode 100644 index 0000000..9327b21 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/redc_1.asm @@ -0,0 +1,591 @@ +dnl X86-64 mpn_redc_1 optimised for AMD K8-K10. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2004, 2008, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C AMD K8,K9 ? +C AMD K10 ? +C AMD bull ? 
+C AMD pile ? +C AMD steam ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 ? +C Intel core ? +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel atom ? +C VIA nano ? + +C The inner loops of this code are the result of running a code generation and +C optimisation tool suite written by David Harvey and Torbjörn Granlund. + +C TODO +C * Micro-optimise, none performed thus far. +C * This looks different from other current redc_1.asm variants. Consider +C adapting this to the mainstream style. +C * Is this code really faster than more approaches which compute q0 later? +C Is the use of a jump table faster? Or is the edge of this due to the +C inlined add_n code? +C * Put initial m[0] x q0 computation in header. +C * Put basecases at the file's end, single them out before the pushes. + +define(`rp', `%rdi') C rcx +define(`up', `%rsi') C rdx +define(`mp_param', `%rdx') C r8 +define(`n', `%rcx') C r9 +define(`u0inv', `%r8') C stack + +define(`i', `%r11') +define(`nneg', `%r12') +define(`mp', `%r13') +define(`q0', `%rbp') +define(`vp', `%rdx') + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_redc_1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') + push %rbp + mov (up), q0 C up[0] + push %rbx + imul u0inv, q0 C first q0, for all execution paths + push %r12 + push %r13 + push %r14 + push %r15 + + mov n, nneg + neg nneg + lea (mp_param,n,8), mp C mp += n + lea -16(up,n,8), up C up += n + + mov R32(n), R32(%rax) + and $3, R32(%rax) + lea 4(%rax), %r9 + cmp $4, R32(n) + cmovg %r9, %rax + lea L(tab)(%rip), %r9 +ifdef(`PIC',` + movslq (%r9,%rax,4), %rax + add %r9, %rax + jmp *%rax +',` + jmp *(%r9,%rax,8) +') + + JUMPTABSECT + ALIGN(8) +L(tab): JMPENT( L(0), L(tab)) + JMPENT( L(1), L(tab)) + JMPENT( L(2), L(tab)) + JMPENT( L(3), L(tab)) + JMPENT( L(0m4), L(tab)) + JMPENT( L(1m4), L(tab)) + JMPENT( L(2m4), L(tab)) + JMPENT( L(3m4), L(tab)) + TEXT + + ALIGN(16) +L(1): mov (mp_param), %rax + mul q0 + add
8(up), %rax + adc 16(up), %rdx + mov %rdx, (rp) + mov $0, R32(%rax) + adc R32(%rax), R32(%rax) + jmp L(ret) + + + ALIGN(16) +L(2): mov (mp_param), %rax + mul q0 + xor R32(%r14), R32(%r14) + mov %rax, %r10 + mov -8(mp), %rax + mov %rdx, %r9 + mul q0 + add (up), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 8(up), %r9 + adc $0, %r14 + mov %r9, q0 + imul u0inv, q0 + mov -16(mp), %rax + mul q0 + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov -8(mp), %rax + mov %rdx, %r11 + mul q0 + add %r9, %r10 + adc %rax, %r11 + adc %rdx, %rbx + add 16(up), %r11 + adc $0, %rbx + xor R32(%rax), R32(%rax) + add %r11, %r14 + adc 24(up), %rbx + mov %r14, (rp) + mov %rbx, 8(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + +L(3): mov (mp_param), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add -8(up), %rbx + adc %rax, %r10 + mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add (up), %r10 + mov %r10, (up) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, q0 + imul u0inv, q0 + add %r9, 8(up) + adc $0, %r14 + mov %r14, -8(up) + + mov -24(mp), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add (up), %rbx + adc %rax, %r10 + mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add 8(up), %r10 + mov %r10, 8(up) + adc %rax, %r9 + adc %rdx, %r14 + mov %r10, q0 + imul u0inv, q0 + add %r9, 16(up) + adc $0, %r14 + mov %r14, (up) + + mov -24(mp), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov -16(mp), %rax + mul q0 + xor R32(%r9), R32(%r9) + xor R32(%r14), R32(%r14) + add 8(up), %rbx + adc %rax, %r10 + mov -8(mp), %rax + adc %rdx, %r9 + mul q0 + add 16(up), %r10 + adc %rax, %r9 + adc %rdx, %r14 + add 24(up), %r9 + adc $0, %r14 + + xor R32(%rax), R32(%rax) + add -8(up), %r10 + adc (up), %r9 + adc 32(up), %r14 + mov %r10, (rp) + mov %r9, 8(rp) + mov %r14, 16(rp) + adc R32(%rax), R32(%rax) + jmp L(ret) + + + ALIGN(16) +L(2m4): +L(lo2): mov (mp,nneg,8), 
%rax + mul q0 + xor R32(%r14), R32(%r14) + xor R32(%rbx), R32(%rbx) + mov %rax, %r10 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mov %rdx, %r9 + mul q0 + add 16(up,nneg,8), %r10 + adc %rax, %r9 + mov 16(mp,nneg,8), %rax + adc %rdx, %r14 + mul q0 + mov $0, R32(%r10) C xor? + lea 2(nneg), i + add %r9, %r15 + imul u0inv, %r15 + jmp L(e2) + + ALIGN(16) +L(li2): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 +L(e2): add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li2) + +L(le2): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo2) + + mov nneg, n + sar $2, n + lea 32(up,nneg,8), up + lea (up,nneg,8), vp + + mov -16(up), %r8 + mov -8(up), %r9 + add -16(vp), %r8 + adc -8(vp), %r9 + mov %r8, (rp) + mov %r9, 8(rp) + lea 16(rp), rp + jmp L(addx) + + + ALIGN(16) +L(1m4): +L(lo1): mov (mp,nneg,8), %rax + xor %r9, %r9 + xor R32(%rbx), R32(%rbx) + mul q0 + mov %rax, %r9 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mov %rdx, %r14 + mov $0, R32(%r10) C xor? 
+ mul q0 + add 16(up,nneg,8), %r9 + adc %rax, %r14 + adc %rdx, %rbx + mov 16(mp,nneg,8), %rax + mul q0 + lea 1(nneg), i + add %r14, %r15 + imul u0inv, %r15 + jmp L(e1) + + ALIGN(16) +L(li1): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 +L(e1): add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li1) + +L(le1): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo1) + + mov nneg, n + sar $2, n + lea 24(up,nneg,8), up + lea (up,nneg,8), vp + + mov -8(up), %r8 + add -8(vp), %r8 + mov %r8, (rp) + lea 8(rp), rp + jmp L(addx) + + + ALIGN(16) +L(0): +L(0m4): +L(lo0): mov (mp,nneg,8), %rax + mov nneg, i + mul q0 + xor R32(%r10), R32(%r10) + mov %rax, %r14 + mov %rdx, %rbx + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mul q0 + add 16(up,nneg,8), %r14 + adc %rax, %rbx + adc %rdx, %r10 + add %rbx, %r15 + imul u0inv, %r15 + jmp L(e0) + + ALIGN(16) +L(li0): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 +L(e0): mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li0) + +L(le0): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + add $8, up + mov %r15, q0 + dec n + jnz L(lo0) + + mov 
nneg, n + sar $2, n + clc + lea 16(up,nneg,8), up + lea (up,nneg,8), vp + jmp L(addy) + + + ALIGN(16) +L(3m4): +L(lo3): mov (mp,nneg,8), %rax + mul q0 + mov %rax, %rbx + mov %rdx, %r10 + mov 8(mp,nneg,8), %rax + mov 24(up,nneg,8), %r15 + mul q0 + add 16(up,nneg,8), %rbx C result is zero, might carry + mov $0, R32(%rbx) C zero + mov %rbx, %r14 C zero + adc %rax, %r10 + mov 16(mp,nneg,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + add %r10, %r15 + mul q0 + lea 3(nneg), i + imul u0inv, %r15 +C jmp L(li3) + + ALIGN(16) +L(li3): add %r10, (up,i,8) + adc %rax, %r9 + mov (mp,i,8), %rax + adc %rdx, %r14 + xor R32(%r10), R32(%r10) + mul q0 + add %r9, 8(up,i,8) + adc %rax, %r14 + adc %rdx, %rbx + mov 8(mp,i,8), %rax + mul q0 + add %r14, 16(up,i,8) + adc %rax, %rbx + adc %rdx, %r10 + mov 16(mp,i,8), %rax + mul q0 + add %rbx, 24(up,i,8) + mov $0, R32(%r14) C zero + mov %r14, %rbx C zero + adc %rax, %r10 + mov 24(mp,i,8), %rax + mov %r14, %r9 C zero + adc %rdx, %r9 + mul q0 + add $4, i + js L(li3) + +L(le3): add %r10, (up) + adc %rax, %r9 + adc %r14, %rdx + add %r9, 8(up) + adc $0, %rdx + mov %rdx, 16(up,nneg,8) C up[0] + mov %r15, q0 + lea 8(up), up + dec n + jnz L(lo3) + + +C ==== Addition code ==== + mov nneg, n + sar $2, n + lea 40(up,nneg,8), up + lea (up,nneg,8), vp + + mov -24(up), %r8 + mov -16(up), %r9 + mov -8(up), %r10 + add -24(vp), %r8 + adc -16(vp), %r9 + adc -8(vp), %r10 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 16(rp) + lea 24(rp), rp + +L(addx):inc n + jz L(ad3) + +L(addy):mov (up), %r8 + mov 8(up), %r9 + inc n + jmp L(mid) + +C ALIGN(16) +L(al3): adc (vp), %r8 + adc 8(vp), %r9 + adc 16(vp), %r10 + adc 24(vp), %r11 + mov %r8, (rp) + lea 32(up), up + mov %r9, 8(rp) + mov %r10, 16(rp) + inc n + mov %r11, 24(rp) + lea 32(vp), vp + mov (up), %r8 + mov 8(up), %r9 + lea 32(rp), rp +L(mid): mov 16(up), %r10 + mov 24(up), %r11 + jnz L(al3) + +L(ae3): adc (vp), %r8 + adc 8(vp), %r9 + adc 16(vp), %r10 + adc 24(vp), %r11 + mov %r8, (rp) + mov %r9, 8(rp) + mov %r10, 
16(rp) + mov %r11, 24(rp) + +L(ad3): mov R32(n), R32(%rax) C zero + adc R32(%rax), R32(%rax) + +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbx + pop %rbp + FUNC_EXIT() + ret +EPILOGUE() diff --git a/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm b/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm new file mode 100644 index 0000000..60cf945 --- /dev/null +++ b/vendor/gmp-6.3.0/mpn/x86_64/k8/sqr_basecase.asm @@ -0,0 +1,807 @@ +dnl AMD64 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C The inner loops of this code are the result of running a code generation and +C optimization tool suite written by David Harvey and Torbjorn Granlund. + +C NOTES +C * There is a major stupidity in that we call mpn_mul_1 initially, for a +C large trip count. 
Instead, we should follow the generic/sqr_basecase.c
+C     code which uses addmul_2s from the start, conditionally leaving a 1x1
+C     multiply to the end.  (In assembly code, one would stop invoking
+C     addmul_2s loops when perhaps 3x2s respectively a 2x2s remains.)
+C   * Another stupidity is in the sqr_diag_addlsh1 code.  It does not need to
+C     save/restore carry, instead it can propagate into the high product word.
+C   * Align more labels, should shave off a few cycles.
+C   * We can safely use 32-bit size operations, since operands with (2^32)
+C     limbs will lead to non-termination in practice.
+C   * The jump table could probably be optimized, at least for non-pic.
+C   * The special code for n <= 4 was quickly written.  It is probably too
+C     large and unnecessarily slow.
+C   * Consider combining small cases code so that the n=k-1 code jumps into the
+C     middle of the n=k code.
+C   * Avoid saving registers for small cases code.
+C   * Needed variables:
+C    n   r11  input size
+C    i   r8   work left, initially n
+C    j   r9   inner loop count
+C        r15  unused
+C    v0  r13
+C    v1  r14
+C    rp  rdi
+C    up  rsi
+C    w0  rbx
+C    w1  rcx
+C    w2  rbp
+C    w3  r10
+C    tp  r12
+C    lo  rax
+C    hi  rdx
+C        rsp
+
+C INPUT PARAMETERS
+define(`rp',	  `%rdi')
+define(`up',	  `%rsi')
+define(`n_param', `%rdx')
+
+define(`n',	`%r11')
+define(`tp',	`%r12')
+define(`i',	`%r8')
+define(`j',	`%r9')
+define(`v0',	`%r13')
+define(`v1',	`%r14')
+define(`w0',	`%rbx')
+define(`w1',	`%rcx')
+define(`w2',	`%rbp')
+define(`w3',	`%r10')
+
+C mpn_sqr_basecase -- rp, up and n_param arrive in rdi, rsi and rdx as the
+C defines above state.  The n = 1 case below stores the full product up[0]^2
+C at rp[0..1]; larger sizes presumably produce the 2n-limb square in the same
+C way (TODO confirm against the C prototype, which is not visible here).
+C
+C Layout of the routine:
+C   1. Prologue: reserve 40 bytes, spill rbx, rbp, r12, r13, r14 and dispatch
+C      through L(tab).
+C   2. L(1)..L(4): straight-line code for n = 1..4.
+C   3. L(0m4)..L(3m4): one preamble per n mod 4 for n > 4, each entering the
+C      shared L(dowhile) loop at a different point.
+C   4. The final addmul_2s_2 step and the sqr_diag_addlsh1 pass, then the
+C      common epilogue that reloads the spilled registers.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+	FUNC_ENTRY(3)
+	mov	R32(n_param), R32(%rcx)
+	mov	R32(n_param), R32(n)		C free original n register (rdx)
+
+C Reserve a 40-byte frame; the five registers spilled below live in it at
+C offsets 32, 24, 16, 8 and 0.
+	add	$-40, %rsp
+
+C rcx = n mod 4 and r8 = (n mod 4) + 4.  The flags set by cmp are still live
+C at the cmovg further down because mov and lea do not modify flags.
+	and	$3, R32(%rcx)
+	cmp	$4, R32(n_param)
+	lea	4(%rcx), %r8
+
+	mov	%rbx, 32(%rsp)
+	mov	%rbp, 24(%rsp)
+	mov	%r12, 16(%rsp)
+	mov	%r13, 8(%rsp)
+	mov	%r14, (%rsp)
+
+C Table index: n mod 4 when n <= 4, otherwise (n mod 4) + 4.
+	cmovg	%r8, %rcx
+
+	lea	L(tab)(%rip), %rax
+ifdef(`PIC',
+`	movslq	(%rax,%rcx,4), %r10
+	add	%r10, %rax
+	jmp	*%rax
+',`
+	jmp	*(%rax,%rcx,8)
+')
+C Slots 0-3 serve n <= 4; slot 0 is n = 4 and would also be selected by
+C n = 0, so presumably callers never pass n = 0 (TODO confirm).  Slots 4-7
+C serve n > 4, ordered by n mod 4.
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(4), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(0m4), L(tab))
+	JMPENT(	L(1m4), L(tab))
+	JMPENT(	L(2m4), L(tab))
+	JMPENT(	L(3m4), L(tab))
+	TEXT
+
+C n = 1: rp[0..1] = up[0]^2.  None of the spilled registers is written on
+C this path, so the frame is released without reloading anything.
+L(1):	mov	(up), %rax
+	mul	%rax
+	add	$40, %rsp
+	mov	%rax, (rp)
+	mov	%rdx, 8(rp)
+	FUNC_EXIT()
+	ret
+
+C n = 2: r8 = u0, r11 = u1.  u0^2 and u1^2 are formed first, then the cross
+C product u0*u1 is added in twice through the two add/adc chains.
+L(2):	mov	(up), %rax
+	mov	%rax, %r8
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, %r9
+	mul	%rax
+	add	$40, %rsp
+	mov	%rax, %r10
+	mov	%r11, %rax
+	mov	%rdx, %r11
+	mul	%r8
+	xor	%r8, %r8
+	add	%rax, %r9
+	adc	%rdx, %r10
+	adc	%r8, %r11
+	add	%rax, %r9
+	mov	%r9, 8(rp)
+	adc	%rdx, %r10
+	mov	%r10, 16(rp)
+	adc	%r8, %r11
+	mov	%r11, 24(rp)
+	FUNC_EXIT()
+	ret
+
+C n = 3: the three squares are written to rp[0..5] first.  The cross products
+C u0*u1, u0*u2 and u1*u2 are then summed in r8, r9, r10, rdx, doubled with
+C r11 (zero beforehand) catching the top carry, and added into rp[1..5].
+L(3):	mov	(up), %rax
+	mov	%rax, %r10
+	mul	%rax
+	mov	8(up), %r11
+	mov	%rax, (rp)
+	mov	%r11, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax
+	mov	16(up), %rcx
+	mov	%rax, 16(rp)
+	mov	%rcx, %rax
+	mov	%rdx, 24(rp)
+	mul	%rax
+	mov	%rax, 32(rp)
+	mov	%rdx, 40(rp)
+
+	mov	%r11, %rax
+	mul	%r10
+	mov	%rax, %r8
+	mov	%rcx, %rax
+	mov	%rdx, %r9
+	mul	%r10
+	xor	%r10, %r10
+	add	%rax, %r9
+	mov	%r11, %rax
+	mov	%r10, %r11
+	adc	%rdx, %r10
+
+	mul	%rcx
+	add	$40, %rsp
+	add	%rax, %r10
+	adc	%r11, %rdx
+	add	%r8, %r8
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%rdx, %rdx
+	adc	%r11, %r11
+	add	%r8, 8(rp)
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%rdx, 32(rp)
+	adc	%r11, 40(rp)
+	FUNC_EXIT()
+	ret
+
+C n = 4: same scheme as n = 3 with six cross products.  rbx is used as
+C scratch on this path, so the frame is released in two steps: add 32 leaves
+C the rbx slot on top of the stack and pop rbx reloads it.
+L(4):	mov	(up), %rax
+	mov	%rax, %r11
+	mul	%rax
+	mov	8(up), %rbx
+	mov	%rax, (rp)
+	mov	%rbx, %rax
+	mov	%rdx, 8(rp)
+	mul	%rax
+	mov	%rax, 16(rp)
+	mov	%rdx, 24(rp)
+	mov	16(up), %rax
+	mul	%rax
+	mov	%rax, 32(rp)
+	mov	%rdx, 40(rp)
+	mov	24(up), %rax
+	mul	%rax
+	mov	%rax, 48(rp)
+	mov	%rbx, %rax
+	mov	%rdx, 56(rp)
+
+	mul	%r11
+	add	$32, %rsp			C drop frame except the rbx slot
+	mov	%rax, %r8
+	mov	%rdx, %r9
+	mov	16(up), %rax
+	mul	%r11
+	xor	%r10, %r10
+	add	%rax, %r9
+	adc	%rdx, %r10
+	mov	24(up), %rax
+	mul	%r11
+	xor	%r11, %r11
+	add	%rax, %r10
+	adc	%rdx, %r11
+	mov	16(up), %rax
+	mul	%rbx
+	xor	%rcx, %rcx
+	add	%rax, %r10
+	adc	%rdx, %r11
+	adc	$0, %rcx
+	mov	24(up), %rax
+	mul	%rbx
+	pop	%rbx				C reload the saved rbx
+	add	%rax, %r11
+	adc	%rdx, %rcx
+	mov	16(up), %rdx
+	mov	24(up), %rax
+	mul	%rdx
+	add	%rax, %rcx
+	adc	$0, %rdx
+
+	add	%r8, %r8
+	adc	%r9, %r9
+	adc	%r10, %r10
+	adc	%r11, %r11
+	adc	%rcx, %rcx
+	mov	$0, R32(%rax)
+	adc	%rdx, %rdx
+
+	adc	%rax, %rax
+	add	%r8, 8(rp)
+	adc	%r9, 16(rp)
+	adc	%r10, 24(rp)
+	adc	%r11, 32(rp)
+	adc	%rcx, 40(rp)
+	adc	%rdx, 48(rp)
+	adc	%rax, 56(rp)
+	FUNC_EXIT()
+	ret
+
+
+C n > 4 with n mod 4 = 0.  The loop below multiplies by v0 = up[0] only and
+C stores into tp, then control continues at L(dowhile).
+L(0m4):
+	lea	-16(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0
+	mov	8(up), %rax
+	lea	(up,n,8), up		C point up at end of input operand
+
+	lea	-4(n), i
+C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
+	xor	R32(j), R32(j)
+	sub	n, j
+
+	mul	v0
+	xor	R32(w2), R32(w2)
+	mov	%rax, w0
+	mov	16(up,j,8), %rax
+	mov	%rdx, w3
+	jmp	L(L3)
+
+	ALIGN(16)
+L(mul_1_m3_top):
+	add	%rax, w2
+	mov	w3, (tp,j,8)
+	mov	(up,j,8), %rax
+	adc	%rdx, w1
+	xor	R32(w0), R32(w0)
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	w2, 8(tp,j,8)
+	add	%rax, w1
+	adc	%rdx, w0
+	mov	8(up,j,8), %rax
+	mov	w1, 16(tp,j,8)
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	16(up,j,8), %rax
+	adc	%rdx, w3
+L(L3):	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	24(up,j,8), %rax
+	adc	%rdx, w2
+	mov	w0, 24(tp,j,8)
+	mul	v0
+	add	$4, j
+	js	L(mul_1_m3_top)
+
+	add	%rax, w2
+	mov	w3, (tp)
+	adc	%rdx, w1
+	mov	w2, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+	lea	-8(up), up
+	jmp	L(dowhile)
+
+
+C n > 4 with n mod 4 = 1.  The loop below multiplies by both v0 = up[0] and
+C v1 = up[1], then control continues at L(dowhile_end).
+L(1m4):
+	lea	8(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C u0
+	mov	8(up), %rax		C u1
+	lea	8(up,n,8), up		C point up at end of input operand
+
+	lea	-3(n), i
+C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
+	lea	-3(n), j
+	neg	j
+
+	mov	%rax, v1		C u1
+	mul	v0			C u0 * u1
+	mov	%rdx, w1
+	xor	R32(w2), R32(w2)
+	mov	%rax, 8(rp)
+	jmp	L(m0)
+
+	ALIGN(16)
+L(mul_2_m0_top):
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1			C v1 * u0
+	add	%rax, w1
+	mov	w0, -24(tp,j,8)
+	adc	%rdx, w2
+L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
+	mul	v0			C u0 * u2
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(tp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(tp,j,8)
+	adc	%rdx, w0
+L(m2x):	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	add	$4, j
+	mov	-32(up,j,8), %rax
+	mov	w3, -32(tp,j,8)
+	js	L(mul_2_m0_top)
+
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, -8(tp)
+	mov	w1, (tp)
+
+	lea	-16(up), up
+C NOTE(review): eval(3*8-24) evaluates to 0, so this lea leaves tp unchanged
+C even though the trailing note says tp += 3.
+	lea	eval(3*8-24)(tp), tp	C tp += 3
+	jmp	L(dowhile_end)
+
+
+C n > 4 with n mod 4 = 2.  The loop below multiplies by v0 = up[0] only,
+C then control continues at L(dowhile_mid).
+L(2m4):
+	lea	-16(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0
+	mov	8(up), %rax
+	lea	(up,n,8), up		C point up at end of input operand
+
+	lea	-4(n), i
+C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
+	lea	-2(n), j
+	neg	j
+
+	mul	v0
+	mov	%rax, w2
+	mov	(up,j,8), %rax
+	mov	%rdx, w1
+	jmp	L(L1)
+
+	ALIGN(16)
+L(mul_1_m1_top):
+	add	%rax, w2
+	mov	w3, (tp,j,8)
+	mov	(up,j,8), %rax
+	adc	%rdx, w1
+L(L1):	xor	R32(w0), R32(w0)
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	w2, 8(tp,j,8)
+	add	%rax, w1
+	adc	%rdx, w0
+	mov	8(up,j,8), %rax
+	mov	w1, 16(tp,j,8)
+	xor	R32(w2), R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	16(up,j,8), %rax
+	adc	%rdx, w3
+	xor	R32(w1), R32(w1)
+	mul	v0
+	add	%rax, w3
+	mov	24(up,j,8), %rax
+	adc	%rdx, w2
+	mov	w0, 24(tp,j,8)
+	mul	v0
+	add	$4, j
+	js	L(mul_1_m1_top)
+
+	add	%rax, w2
+	mov	w3, (tp)
+	adc	%rdx, w1
+	mov	w2, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+	lea	-8(up), up
+	jmp	L(dowhile_mid)
+
+
+C n > 4 with n mod 4 = 3.  The loop below multiplies by both v0 = up[0] and
+C v1 = up[1], then control continues at L(dowhile_mid).
+L(3m4):
+	lea	8(rp,n,8), tp		C point tp in middle of result operand
+	mov	(up), v0		C u0
+	mov	8(up), %rax		C u1
+	lea	8(up,n,8), up		C point up at end of input operand
+
+	lea	-5(n), i
+C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
+	lea	-1(n), j
+	neg	j
+
+	mov	%rax, v1		C u1
+	mul	v0			C u0 * u1
+	mov	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	%rax, 8(rp)
+	jmp	L(m2)
+
+	ALIGN(16)
+L(mul_2_m2_top):
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	-24(up,j,8), %rax
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	-24(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1			C v1 * u0
+	add	%rax, w1
+	mov	w0, -24(tp,j,8)
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	mul	v0
+	mov	$0, R32(w3)
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	-16(up,j,8), %rax
+	adc	$0, R32(w3)
+	mov	$0, R32(w0)
+	mov	w1, -16(tp,j,8)
+	mul	v1
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mul	v0
+	add	%rax, w2
+	mov	-8(up,j,8), %rax
+	adc	%rdx, w3
+	adc	$0, R32(w0)
+	mul	v1
+	add	%rax, w3
+	mov	w2, -8(tp,j,8)
+	adc	%rdx, w0
+L(m2):	mov	(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	add	$4, j
+	mov	-32(up,j,8), %rax
+	mov	w3, -32(tp,j,8)
+	js	L(mul_2_m2_top)
+
+	mul	v1
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, -8(tp)
+	mov	w1, (tp)
+
+	lea	-16(up), up
+	jmp	L(dowhile_mid)
+
+C Main loop.  Each trip runs two addmul_2 passes; each pass loads a fresh
+C multiplier pair v0, v1 from up, adds into tp, advances tp by two limbs and
+C lowers i by 2.  The loop repeats until the second decrement brings i to
+C zero.
+L(dowhile):
+C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
+	lea	4(i), j
+	neg	j
+
+	mov	16(up,j,8), v0
+	mov	24(up,j,8), v1
+	mov	24(up,j,8), %rax
+	mul	v0
+	xor	R32(w3), R32(w3)
+	add	%rax, 24(tp,j,8)
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	jmp	L(am2)
+
+	ALIGN(16)
+L(addmul_2_m2_top):
+	add	w3, (tp,j,8)
+	adc	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1				C v1 * u0
+	add	w0, 8(tp,j,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+	mov	16(up,j,8), %rax
+	mov	$0, R32(w3)
+	mul	v0				C v0 * u1
+	add	%rax, w1
+	mov	16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1				C v1 * u1
+	add	w1, 16(tp,j,8)
+	adc	%rax, w2
+	mov	24(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	24(up,j,8), %rax
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 24(tp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+L(am2):	mov	32(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	32(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	$4, j
+	js	L(addmul_2_m2_top)
+
+	add	w3, (tp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+
+	add	$-2, R32(i)		C i -= 2
+
+L(dowhile_mid):
+C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
+	lea	2(i), j
+	neg	j
+
+	mov	(up,j,8), v0
+	mov	8(up,j,8), v1
+	mov	8(up,j,8), %rax
+	mul	v0
+	xor	R32(w1), R32(w1)
+	add	%rax, 8(tp,j,8)
+	adc	%rdx, w1
+	xor	R32(w2), R32(w2)
+	jmp	L(20)
+
+	ALIGN(16)
+L(addmul_2_m0_top):
+	add	w3, (tp,j,8)
+	adc	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	mov	$0, R32(w2)
+	mul	v0
+	add	%rax, w0
+	mov	8(up,j,8), %rax
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+	mul	v1				C v1 * u0
+	add	w0, 8(tp,j,8)
+	adc	%rax, w1
+	adc	%rdx, w2
+L(20):	mov	16(up,j,8), %rax
+	mov	$0, R32(w3)
+	mul	v0				C v0 * u1
+	add	%rax, w1
+	mov	16(up,j,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1				C v1 * u1
+	add	w1, 16(tp,j,8)
+	adc	%rax, w2
+	mov	24(up,j,8), %rax
+	adc	%rdx, w3
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	$0, R32(w1)
+	mov	24(up,j,8), %rax
+	adc	$0, R32(w0)
+	mul	v1
+	add	w2, 24(tp,j,8)
+	adc	%rax, w3
+	adc	%rdx, w0
+	mov	32(up,j,8), %rax
+	mul	v0
+	add	%rax, w3
+	mov	32(up,j,8), %rax
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mul	v1
+	add	$4, j
+	js	L(addmul_2_m0_top)
+
+	add	w3, (tp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+	lea	eval(2*8)(tp), tp	C tp += 2
+L(dowhile_end):
+
+	add	$-2, R32(i)		C i -= 2
+	jne	L(dowhile)
+
+C Function mpn_addmul_2s_2
+C Last step after the loop: v0 and v1 are the limbs at -16(up) and -8(up),
+C and the products involving them and the limb at (up) are added into the
+C words around tp.
+	mov	-16(up), v0
+	mov	-8(up), v1
+	mov	-8(up), %rax
+	mul	v0
+	xor	R32(w3), R32(w3)
+	add	%rax, -8(tp)
+	adc	%rdx, w3
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	(up), %rax
+	mul	v0
+	add	%rax, w3
+	mov	(up), %rax
+	adc	%rdx, w0
+	mul	v1
+	add	w3, (tp)
+	adc	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(tp)
+	mov	w1, 16(tp)
+
+C Function mpn_sqr_diag_addlsh1
+C j = 2n - 4 counts result limbs, two per input limb, which is why up is
+C indexed with scale 4 and rp with scale 8.  Words already in rp are doubled
+C with adc reg,reg and the squares from mul rax are added on top.  CF is
+C parked in rbx or rbp as 0 or -1 by sbb reg,reg and regenerated by
+C add reg,reg, as the save/restore notes on those lines say.  Bit 1 of j
+C selects the L(evn) or L(odd) entry into the two-way unrolled loop.
+	lea	-4(n,n), j
+
+	mov	8(rp), %r11
+	lea	-8(up), up
+	lea	(rp,j,8), rp
+	neg	j
+	mov	(up,j,4), %rax
+	mul	%rax
+	test	$2, R8(j)
+	jnz	L(odd)
+
+L(evn):	add	%r11, %r11
+	sbb	R32(%rbx), R32(%rbx)		C save CF
+	add	%rdx, %r11
+	mov	%rax, (rp,j,8)
+	jmp	L(d0)
+
+L(odd):	add	%r11, %r11
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	add	%rdx, %r11
+	mov	%rax, (rp,j,8)
+	lea	-2(j), j
+	jmp	L(d1)
+
+	ALIGN(16)
+L(top):	mov	(up,j,4), %rax
+	mul	%rax
+	add	R32(%rbp), R32(%rbp)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, (rp,j,8)
+L(d0):	mov	%r11, 8(rp,j,8)
+	mov	16(rp,j,8), %r10
+	adc	%r10, %r10
+	mov	24(rp,j,8), %r11
+	adc	%r11, %r11
+	nop
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	mov	8(up,j,4), %rax
+	mul	%rax
+	add	R32(%rbx), R32(%rbx)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, 16(rp,j,8)
+L(d1):	mov	%r11, 24(rp,j,8)
+	mov	32(rp,j,8), %r10
+	adc	%r10, %r10
+	mov	40(rp,j,8), %r11
+	adc	%r11, %r11
+	sbb	R32(%rbx), R32(%rbx)		C save CF
+	add	$4, j
+	js	L(top)
+
+C Wind-down for the last two input limbs.  neg turns the saved 0 or -1 in
+C rbp into 0 or 1 so it can be folded into the top word with adc rbp, rdx.
+	mov	(up), %rax
+	mul	%rax
+	add	R32(%rbp), R32(%rbp)		C restore carry
+	adc	%rax, %r10
+	adc	%rdx, %r11
+	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	mov	16(rp), %r10
+	adc	%r10, %r10
+	sbb	R32(%rbp), R32(%rbp)		C save CF
+	neg	R32(%rbp)
+	mov	8(up), %rax
+	mul	%rax
+	add	R32(%rbx), R32(%rbx)		C restore carry
+	adc	%rax, %r10
+	adc	%rbp, %rdx
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+
+C Reload the spilled registers; the pop order walks the frame slots at
+C offsets 0, 8, 16, 24 and 32 written in the prologue.
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+EPILOGUE()
-- cgit v1.2.3